diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,140034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.32, + "eval_steps": 500, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.6e-05, + "grad_norm": 0.7421875, + "learning_rate": 0.0, + "loss": 0.174, + "step": 1 + }, + { + "epoch": 3.2e-05, + "grad_norm": 1.1328125, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.1894, + "step": 2 + }, + { + "epoch": 4.8e-05, + "grad_norm": 0.64453125, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.1369, + "step": 3 + }, + { + "epoch": 6.4e-05, + "grad_norm": 0.7890625, + "learning_rate": 6.000000000000001e-07, + "loss": 0.1449, + "step": 4 + }, + { + "epoch": 8e-05, + "grad_norm": 0.88671875, + "learning_rate": 8.000000000000001e-07, + "loss": 0.1653, + "step": 5 + }, + { + "epoch": 9.6e-05, + "grad_norm": 0.80859375, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.152, + "step": 6 + }, + { + "epoch": 0.000112, + "grad_norm": 1.0703125, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.167, + "step": 7 + }, + { + "epoch": 0.000128, + "grad_norm": 0.953125, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.1683, + "step": 8 + }, + { + "epoch": 0.000144, + "grad_norm": 0.99609375, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1367, + "step": 9 + }, + { + "epoch": 0.00016, + "grad_norm": 1.03125, + "learning_rate": 1.8e-06, + "loss": 0.1824, + "step": 10 + }, + { + "epoch": 0.000176, + "grad_norm": 0.72265625, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.133, + "step": 11 + }, + { + "epoch": 0.000192, + "grad_norm": 1.171875, + "learning_rate": 2.2e-06, + "loss": 0.175, + "step": 12 + }, + { + "epoch": 0.000208, + "grad_norm": 0.77734375, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.1658, + "step": 13 + }, + { + "epoch": 0.000224, + "grad_norm": 0.74609375, + "learning_rate": 2.6e-06, + "loss": 0.1192, + "step": 14 + }, + { + "epoch": 0.00024, + "grad_norm": 1.1171875, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.1685, + "step": 15 + }, + { + "epoch": 0.000256, + "grad_norm": 0.7421875, + "learning_rate": 3e-06, + "loss": 0.1557, + "step": 16 + }, + { + "epoch": 0.000272, + "grad_norm": 0.76171875, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.1482, + "step": 17 + }, + { + "epoch": 0.000288, + "grad_norm": 0.703125, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.2086, + "step": 18 + }, + { + "epoch": 0.000304, + "grad_norm": 0.9375, + "learning_rate": 3.6e-06, + "loss": 0.1731, + "step": 19 + }, + { + "epoch": 0.00032, + "grad_norm": 1.0234375, + "learning_rate": 3.8e-06, + "loss": 0.1784, + "step": 20 + }, + { + "epoch": 0.000336, + "grad_norm": 0.7890625, + "learning_rate": 4.000000000000001e-06, + "loss": 0.1804, + "step": 21 + }, + { + "epoch": 0.000352, + "grad_norm": 0.921875, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.1808, + "step": 22 + }, + { + "epoch": 0.000368, + "grad_norm": 0.8125, + "learning_rate": 4.4e-06, + "loss": 0.1685, + "step": 23 + }, + { + "epoch": 0.000384, + "grad_norm": 0.8359375, + "learning_rate": 4.6e-06, + "loss": 0.2063, + "step": 24 + }, + { + "epoch": 0.0004, + "grad_norm": 1.109375, + "learning_rate": 4.800000000000001e-06, + "loss": 0.1327, + "step": 25 + }, + { + "epoch": 0.000416, + "grad_norm": 1.28125, + "learning_rate": 5e-06, + "loss": 0.1434, + "step": 26 + }, + { + "epoch": 0.000432, + "grad_norm": 0.64453125, + "learning_rate": 5.2e-06, + "loss": 0.1345, + "step": 27 + }, + { + "epoch": 0.000448, + "grad_norm": 0.77734375, + "learning_rate": 5.4e-06, + "loss": 0.1699, + "step": 28 + }, + { + "epoch": 0.000464, + "grad_norm": 1.0078125, + "learning_rate": 5.600000000000001e-06, + "loss": 0.2125, + "step": 29 + }, + { + "epoch": 0.00048, + "grad_norm": 1.078125, + "learning_rate": 5.8e-06, + "loss": 0.1713, + "step": 30 + }, + { + "epoch": 0.000496, + "grad_norm": 1.1484375, + "learning_rate": 6e-06, + "loss": 0.1814, + "step": 31 + }, + { + "epoch": 0.000512, + "grad_norm": 0.8359375, + "learning_rate": 6.2e-06, + "loss": 0.158, + "step": 32 + }, + { + "epoch": 0.000528, + "grad_norm": 1.3984375, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.1556, + "step": 33 + }, + { + "epoch": 0.000544, + "grad_norm": 0.6171875, + "learning_rate": 6.6e-06, + "loss": 0.144, + "step": 34 + }, + { + "epoch": 0.00056, + "grad_norm": 0.8984375, + "learning_rate": 6.800000000000001e-06, + "loss": 0.1726, + "step": 35 + }, + { + "epoch": 0.000576, + "grad_norm": 0.640625, + "learning_rate": 7.000000000000001e-06, + "loss": 0.123, + "step": 36 + }, + { + "epoch": 0.000592, + "grad_norm": 1.1640625, + "learning_rate": 7.2e-06, + "loss": 0.1859, + "step": 37 + }, + { + "epoch": 0.000608, + "grad_norm": 0.78515625, + "learning_rate": 7.4e-06, + "loss": 0.1612, + "step": 38 + }, + { + "epoch": 0.000624, + "grad_norm": 0.671875, + "learning_rate": 7.6e-06, + "loss": 0.1631, + "step": 39 + }, + { + "epoch": 0.00064, + "grad_norm": 1.046875, + "learning_rate": 7.8e-06, + "loss": 0.1854, + "step": 40 + }, + { + "epoch": 0.000656, + "grad_norm": 1.328125, + "learning_rate": 8.000000000000001e-06, + "loss": 0.2083, + "step": 41 + }, + { + "epoch": 0.000672, + "grad_norm": 1.1484375, + "learning_rate": 8.200000000000001e-06, + "loss": 0.1447, + "step": 42 + }, + { + "epoch": 0.000688, + "grad_norm": 0.9140625, + "learning_rate": 8.400000000000001e-06, + "loss": 0.1509, + "step": 43 + }, + { + "epoch": 0.000704, + "grad_norm": 0.765625, + "learning_rate": 8.599999999999999e-06, + "loss": 0.1864, + "step": 44 + }, + { + "epoch": 0.00072, + "grad_norm": 0.76171875, + "learning_rate": 8.8e-06, + "loss": 0.132, + "step": 45 + }, + { + "epoch": 0.000736, + "grad_norm": 1.0625, + "learning_rate": 9e-06, + "loss": 0.1832, + "step": 46 + }, + { + "epoch": 0.000752, + "grad_norm": 0.88671875, + "learning_rate": 9.2e-06, + "loss": 0.1795, + "step": 47 + }, + { + "epoch": 0.000768, + "grad_norm": 0.875, + "learning_rate": 9.4e-06, + "loss": 0.1725, + "step": 48 + }, + { + "epoch": 0.000784, + "grad_norm": 0.81640625, + "learning_rate": 9.600000000000001e-06, + "loss": 0.1437, + "step": 49 + }, + { + "epoch": 0.0008, + "grad_norm": 0.9140625, + "learning_rate": 9.800000000000001e-06, + "loss": 0.166, + "step": 50 + }, + { + "epoch": 0.000816, + "grad_norm": 0.8046875, + "learning_rate": 1e-05, + "loss": 0.1476, + "step": 51 + }, + { + "epoch": 0.000832, + "grad_norm": 0.8046875, + "learning_rate": 1.02e-05, + "loss": 0.1428, + "step": 52 + }, + { + "epoch": 0.000848, + "grad_norm": 1.125, + "learning_rate": 1.04e-05, + "loss": 0.171, + "step": 53 + }, + { + "epoch": 0.000864, + "grad_norm": 1.3515625, + "learning_rate": 1.06e-05, + "loss": 0.1706, + "step": 54 + }, + { + "epoch": 0.00088, + "grad_norm": 0.89453125, + "learning_rate": 1.08e-05, + "loss": 0.1609, + "step": 55 + }, + { + "epoch": 0.000896, + "grad_norm": 1.03125, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.1793, + "step": 56 + }, + { + "epoch": 0.000912, + "grad_norm": 0.734375, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.1266, + "step": 57 + }, + { + "epoch": 0.000928, + "grad_norm": 0.99609375, + "learning_rate": 1.1400000000000001e-05, + "loss": 0.1899, + "step": 58 + }, + { + "epoch": 0.000944, + "grad_norm": 0.8671875, + "learning_rate": 1.16e-05, + "loss": 0.1401, + "step": 59 + }, + { + "epoch": 0.00096, + "grad_norm": 1.359375, + "learning_rate": 1.18e-05, + "loss": 0.1821, + "step": 60 + }, + { + "epoch": 0.000976, + "grad_norm": 0.65625, + "learning_rate": 1.2e-05, + "loss": 0.1617, + "step": 61 + }, + { + "epoch": 0.000992, + "grad_norm": 0.89453125, + "learning_rate": 1.22e-05, + "loss": 0.1581, + "step": 62 + }, + { + "epoch": 0.001008, + "grad_norm": 0.80078125, + "learning_rate": 1.24e-05, + "loss": 0.1682, + "step": 63 + }, + { + "epoch": 0.001024, + "grad_norm": 0.9453125, + "learning_rate": 1.2600000000000001e-05, + "loss": 0.1505, + "step": 64 + }, + { + "epoch": 0.00104, + "grad_norm": 0.734375, + "learning_rate": 1.2800000000000001e-05, + "loss": 0.1694, + "step": 65 + }, + { + "epoch": 0.001056, + "grad_norm": 0.78125, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.1492, + "step": 66 + }, + { + "epoch": 0.001072, + "grad_norm": 0.68359375, + "learning_rate": 1.32e-05, + "loss": 0.1371, + "step": 67 + }, + { + "epoch": 0.001088, + "grad_norm": 0.9921875, + "learning_rate": 1.3400000000000002e-05, + "loss": 0.1462, + "step": 68 + }, + { + "epoch": 0.001104, + "grad_norm": 0.98828125, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.1775, + "step": 69 + }, + { + "epoch": 0.00112, + "grad_norm": 0.734375, + "learning_rate": 1.3800000000000002e-05, + "loss": 0.1499, + "step": 70 + }, + { + "epoch": 0.001136, + "grad_norm": 0.79296875, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.2359, + "step": 71 + }, + { + "epoch": 0.001152, + "grad_norm": 0.69921875, + "learning_rate": 1.42e-05, + "loss": 0.1679, + "step": 72 + }, + { + "epoch": 0.001168, + "grad_norm": 0.73046875, + "learning_rate": 1.44e-05, + "loss": 0.1869, + "step": 73 + }, + { + "epoch": 0.001184, + "grad_norm": 1.3671875, + "learning_rate": 1.4599999999999999e-05, + "loss": 0.1773, + "step": 74 + }, + { + "epoch": 0.0012, + "grad_norm": 0.8828125, + "learning_rate": 1.48e-05, + "loss": 0.2028, + "step": 75 + }, + { + "epoch": 0.001216, + "grad_norm": 0.90625, + "learning_rate": 1.5e-05, + "loss": 0.1804, + "step": 76 + }, + { + "epoch": 0.001232, + "grad_norm": 0.83984375, + "learning_rate": 1.52e-05, + "loss": 0.1782, + "step": 77 + }, + { + "epoch": 0.001248, + "grad_norm": 0.88671875, + "learning_rate": 1.54e-05, + "loss": 0.1598, + "step": 78 + }, + { + "epoch": 0.001264, + "grad_norm": 1.1328125, + "learning_rate": 1.56e-05, + "loss": 0.1331, + "step": 79 + }, + { + "epoch": 0.00128, + "grad_norm": 1.1640625, + "learning_rate": 1.58e-05, + "loss": 0.1671, + "step": 80 + }, + { + "epoch": 0.001296, + "grad_norm": 1.515625, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.2039, + "step": 81 + }, + { + "epoch": 0.001312, + "grad_norm": 0.86328125, + "learning_rate": 1.62e-05, + "loss": 0.1371, + "step": 82 + }, + { + "epoch": 0.001328, + "grad_norm": 1.671875, + "learning_rate": 1.6400000000000002e-05, + "loss": 0.1956, + "step": 83 + }, + { + "epoch": 0.001344, + "grad_norm": 0.9375, + "learning_rate": 1.66e-05, + "loss": 0.1622, + "step": 84 + }, + { + "epoch": 0.00136, + "grad_norm": 0.92578125, + "learning_rate": 1.6800000000000002e-05, + "loss": 0.1371, + "step": 85 + }, + { + "epoch": 0.001376, + "grad_norm": 1.9296875, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.1759, + "step": 86 + }, + { + "epoch": 0.001392, + "grad_norm": 1.34375, + "learning_rate": 1.7199999999999998e-05, + "loss": 0.1703, + "step": 87 + }, + { + "epoch": 0.001408, + "grad_norm": 0.78515625, + "learning_rate": 1.74e-05, + "loss": 0.1557, + "step": 88 + }, + { + "epoch": 0.001424, + "grad_norm": 1.0390625, + "learning_rate": 1.76e-05, + "loss": 0.1653, + "step": 89 + }, + { + "epoch": 0.00144, + "grad_norm": 1.046875, + "learning_rate": 1.78e-05, + "loss": 0.198, + "step": 90 + }, + { + "epoch": 0.001456, + "grad_norm": 0.953125, + "learning_rate": 1.8e-05, + "loss": 0.1726, + "step": 91 + }, + { + "epoch": 0.001472, + "grad_norm": 0.62890625, + "learning_rate": 1.8200000000000002e-05, + "loss": 0.1674, + "step": 92 + }, + { + "epoch": 0.001488, + "grad_norm": 1.1640625, + "learning_rate": 1.84e-05, + "loss": 0.1423, + "step": 93 + }, + { + "epoch": 0.001504, + "grad_norm": 0.828125, + "learning_rate": 1.86e-05, + "loss": 0.1453, + "step": 94 + }, + { + "epoch": 0.00152, + "grad_norm": 0.80859375, + "learning_rate": 1.88e-05, + "loss": 0.1467, + "step": 95 + }, + { + "epoch": 0.001536, + "grad_norm": 1.890625, + "learning_rate": 1.9e-05, + "loss": 0.1738, + "step": 96 + }, + { + "epoch": 0.001552, + "grad_norm": 0.9765625, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.1828, + "step": 97 + }, + { + "epoch": 0.001568, + "grad_norm": 0.9296875, + "learning_rate": 1.94e-05, + "loss": 0.1639, + "step": 98 + }, + { + "epoch": 0.001584, + "grad_norm": 1.5, + "learning_rate": 1.9600000000000002e-05, + "loss": 0.1891, + "step": 99 + }, + { + "epoch": 0.0016, + "grad_norm": 1.0546875, + "learning_rate": 1.9800000000000004e-05, + "loss": 0.1952, + "step": 100 + }, + { + "epoch": 0.001616, + "grad_norm": 0.84375, + "learning_rate": 2e-05, + "loss": 0.1669, + "step": 101 + }, + { + "epoch": 0.001632, + "grad_norm": 0.65625, + "learning_rate": 2.0200000000000003e-05, + "loss": 0.1208, + "step": 102 + }, + { + "epoch": 0.001648, + "grad_norm": 0.98046875, + "learning_rate": 2.04e-05, + "loss": 0.1962, + "step": 103 + }, + { + "epoch": 0.001664, + "grad_norm": 0.78515625, + "learning_rate": 2.06e-05, + "loss": 0.1711, + "step": 104 + }, + { + "epoch": 0.00168, + "grad_norm": 0.8984375, + "learning_rate": 2.08e-05, + "loss": 0.147, + "step": 105 + }, + { + "epoch": 0.001696, + "grad_norm": 0.6875, + "learning_rate": 2.1e-05, + "loss": 0.1624, + "step": 106 + }, + { + "epoch": 0.001712, + "grad_norm": 0.96875, + "learning_rate": 2.12e-05, + "loss": 0.1629, + "step": 107 + }, + { + "epoch": 0.001728, + "grad_norm": 0.90234375, + "learning_rate": 2.1400000000000002e-05, + "loss": 0.1447, + "step": 108 + }, + { + "epoch": 0.001744, + "grad_norm": 1.0625, + "learning_rate": 2.16e-05, + "loss": 0.1665, + "step": 109 + }, + { + "epoch": 0.00176, + "grad_norm": 1.0859375, + "learning_rate": 2.18e-05, + "loss": 0.1938, + "step": 110 + }, + { + "epoch": 0.001776, + "grad_norm": 1.0703125, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.1281, + "step": 111 + }, + { + "epoch": 0.001792, + "grad_norm": 1.125, + "learning_rate": 2.22e-05, + "loss": 0.1806, + "step": 112 + }, + { + "epoch": 0.001808, + "grad_norm": 0.8984375, + "learning_rate": 2.2400000000000002e-05, + "loss": 0.1742, + "step": 113 + }, + { + "epoch": 0.001824, + "grad_norm": 1.5, + "learning_rate": 2.26e-05, + "loss": 0.1644, + "step": 114 + }, + { + "epoch": 0.00184, + "grad_norm": 1.28125, + "learning_rate": 2.2800000000000002e-05, + "loss": 0.1629, + "step": 115 + }, + { + "epoch": 0.001856, + "grad_norm": 1.0546875, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.1904, + "step": 116 + }, + { + "epoch": 0.001872, + "grad_norm": 0.99609375, + "learning_rate": 2.32e-05, + "loss": 0.1586, + "step": 117 + }, + { + "epoch": 0.001888, + "grad_norm": 1.0859375, + "learning_rate": 2.3400000000000003e-05, + "loss": 0.1533, + "step": 118 + }, + { + "epoch": 0.001904, + "grad_norm": 1.9765625, + "learning_rate": 2.36e-05, + "loss": 0.1656, + "step": 119 + }, + { + "epoch": 0.00192, + "grad_norm": 1.2734375, + "learning_rate": 2.38e-05, + "loss": 0.2069, + "step": 120 + }, + { + "epoch": 0.001936, + "grad_norm": 0.90234375, + "learning_rate": 2.4e-05, + "loss": 0.1847, + "step": 121 + }, + { + "epoch": 0.001952, + "grad_norm": 0.6328125, + "learning_rate": 2.4200000000000002e-05, + "loss": 0.1357, + "step": 122 + }, + { + "epoch": 0.001968, + "grad_norm": 1.3671875, + "learning_rate": 2.44e-05, + "loss": 0.1611, + "step": 123 + }, + { + "epoch": 0.001984, + "grad_norm": 0.8203125, + "learning_rate": 2.46e-05, + "loss": 0.1302, + "step": 124 + }, + { + "epoch": 0.002, + "grad_norm": 0.76171875, + "learning_rate": 2.48e-05, + "loss": 0.1717, + "step": 125 + }, + { + "epoch": 0.002016, + "grad_norm": 0.74609375, + "learning_rate": 2.5e-05, + "loss": 0.1391, + "step": 126 + }, + { + "epoch": 0.002032, + "grad_norm": 1.34375, + "learning_rate": 2.5200000000000003e-05, + "loss": 0.1786, + "step": 127 + }, + { + "epoch": 0.002048, + "grad_norm": 0.97265625, + "learning_rate": 2.54e-05, + "loss": 0.2045, + "step": 128 + }, + { + "epoch": 0.002064, + "grad_norm": 1.015625, + "learning_rate": 2.5600000000000002e-05, + "loss": 0.1627, + "step": 129 + }, + { + "epoch": 0.00208, + "grad_norm": 0.92578125, + "learning_rate": 2.58e-05, + "loss": 0.1702, + "step": 130 + }, + { + "epoch": 0.002096, + "grad_norm": 0.703125, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.1836, + "step": 131 + }, + { + "epoch": 0.002112, + "grad_norm": 1.1484375, + "learning_rate": 2.6200000000000003e-05, + "loss": 0.141, + "step": 132 + }, + { + "epoch": 0.002128, + "grad_norm": 0.796875, + "learning_rate": 2.64e-05, + "loss": 0.1592, + "step": 133 + }, + { + "epoch": 0.002144, + "grad_norm": 1.0, + "learning_rate": 2.6600000000000003e-05, + "loss": 0.1565, + "step": 134 + }, + { + "epoch": 0.00216, + "grad_norm": 1.015625, + "learning_rate": 2.6800000000000004e-05, + "loss": 0.1939, + "step": 135 + }, + { + "epoch": 0.002176, + "grad_norm": 1.1796875, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.1388, + "step": 136 + }, + { + "epoch": 0.002192, + "grad_norm": 1.2421875, + "learning_rate": 2.7200000000000004e-05, + "loss": 0.1161, + "step": 137 + }, + { + "epoch": 0.002208, + "grad_norm": 0.73046875, + "learning_rate": 2.7400000000000002e-05, + "loss": 0.1408, + "step": 138 + }, + { + "epoch": 0.002224, + "grad_norm": 1.4765625, + "learning_rate": 2.7600000000000003e-05, + "loss": 0.1761, + "step": 139 + }, + { + "epoch": 0.00224, + "grad_norm": 0.8515625, + "learning_rate": 2.7800000000000005e-05, + "loss": 0.1733, + "step": 140 + }, + { + "epoch": 0.002256, + "grad_norm": 0.7578125, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.1632, + "step": 141 + }, + { + "epoch": 0.002272, + "grad_norm": 1.0078125, + "learning_rate": 2.8199999999999998e-05, + "loss": 0.1258, + "step": 142 + }, + { + "epoch": 0.002288, + "grad_norm": 0.91796875, + "learning_rate": 2.84e-05, + "loss": 0.1679, + "step": 143 + }, + { + "epoch": 0.002304, + "grad_norm": 1.1796875, + "learning_rate": 2.86e-05, + "loss": 0.1344, + "step": 144 + }, + { + "epoch": 0.00232, + "grad_norm": 0.6640625, + "learning_rate": 2.88e-05, + "loss": 0.1395, + "step": 145 + }, + { + "epoch": 0.002336, + "grad_norm": 1.1875, + "learning_rate": 2.9e-05, + "loss": 0.1829, + "step": 146 + }, + { + "epoch": 0.002352, + "grad_norm": 0.9453125, + "learning_rate": 2.9199999999999998e-05, + "loss": 0.1236, + "step": 147 + }, + { + "epoch": 0.002368, + "grad_norm": 0.6015625, + "learning_rate": 2.94e-05, + "loss": 0.1365, + "step": 148 + }, + { + "epoch": 0.002384, + "grad_norm": 0.83203125, + "learning_rate": 2.96e-05, + "loss": 0.1918, + "step": 149 + }, + { + "epoch": 0.0024, + "grad_norm": 1.359375, + "learning_rate": 2.98e-05, + "loss": 0.1845, + "step": 150 + }, + { + "epoch": 0.002416, + "grad_norm": 1.5625, + "learning_rate": 3e-05, + "loss": 0.1969, + "step": 151 + }, + { + "epoch": 0.002432, + "grad_norm": 0.9375, + "learning_rate": 3.02e-05, + "loss": 0.2335, + "step": 152 + }, + { + "epoch": 0.002448, + "grad_norm": 1.5859375, + "learning_rate": 3.04e-05, + "loss": 0.1608, + "step": 153 + }, + { + "epoch": 0.002464, + "grad_norm": 0.96875, + "learning_rate": 3.06e-05, + "loss": 0.1574, + "step": 154 + }, + { + "epoch": 0.00248, + "grad_norm": 1.15625, + "learning_rate": 3.08e-05, + "loss": 0.1992, + "step": 155 + }, + { + "epoch": 0.002496, + "grad_norm": 1.109375, + "learning_rate": 3.1e-05, + "loss": 0.1659, + "step": 156 + }, + { + "epoch": 0.002512, + "grad_norm": 1.203125, + "learning_rate": 3.12e-05, + "loss": 0.1573, + "step": 157 + }, + { + "epoch": 0.002528, + "grad_norm": 1.15625, + "learning_rate": 3.1400000000000004e-05, + "loss": 0.1253, + "step": 158 + }, + { + "epoch": 0.002544, + "grad_norm": 0.75390625, + "learning_rate": 3.16e-05, + "loss": 0.1616, + "step": 159 + }, + { + "epoch": 0.00256, + "grad_norm": 0.91015625, + "learning_rate": 3.18e-05, + "loss": 0.166, + "step": 160 + }, + { + "epoch": 0.002576, + "grad_norm": 0.87109375, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.1635, + "step": 161 + }, + { + "epoch": 0.002592, + "grad_norm": 1.40625, + "learning_rate": 3.2200000000000003e-05, + "loss": 0.1571, + "step": 162 + }, + { + "epoch": 0.002608, + "grad_norm": 0.8671875, + "learning_rate": 3.24e-05, + "loss": 0.1529, + "step": 163 + }, + { + "epoch": 0.002624, + "grad_norm": 1.0234375, + "learning_rate": 3.26e-05, + "loss": 0.1824, + "step": 164 + }, + { + "epoch": 0.00264, + "grad_norm": 1.3984375, + "learning_rate": 3.2800000000000004e-05, + "loss": 0.1937, + "step": 165 + }, + { + "epoch": 0.002656, + "grad_norm": 1.2734375, + "learning_rate": 3.3e-05, + "loss": 0.1855, + "step": 166 + }, + { + "epoch": 0.002672, + "grad_norm": 1.0078125, + "learning_rate": 3.32e-05, + "loss": 0.1935, + "step": 167 + }, + { + "epoch": 0.002688, + "grad_norm": 0.80859375, + "learning_rate": 3.3400000000000005e-05, + "loss": 0.178, + "step": 168 + }, + { + "epoch": 0.002704, + "grad_norm": 0.87890625, + "learning_rate": 3.3600000000000004e-05, + "loss": 0.1569, + "step": 169 + }, + { + "epoch": 0.00272, + "grad_norm": 1.453125, + "learning_rate": 3.38e-05, + "loss": 0.1673, + "step": 170 + }, + { + "epoch": 0.002736, + "grad_norm": 0.90625, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.1317, + "step": 171 + }, + { + "epoch": 0.002752, + "grad_norm": 0.921875, + "learning_rate": 3.4200000000000005e-05, + "loss": 0.1505, + "step": 172 + }, + { + "epoch": 0.002768, + "grad_norm": 0.80078125, + "learning_rate": 3.4399999999999996e-05, + "loss": 0.1696, + "step": 173 + }, + { + "epoch": 0.002784, + "grad_norm": 1.0, + "learning_rate": 3.46e-05, + "loss": 0.176, + "step": 174 + }, + { + "epoch": 0.0028, + "grad_norm": 1.125, + "learning_rate": 3.48e-05, + "loss": 0.1443, + "step": 175 + }, + { + "epoch": 0.002816, + "grad_norm": 1.4140625, + "learning_rate": 3.5e-05, + "loss": 0.1985, + "step": 176 + }, + { + "epoch": 0.002832, + "grad_norm": 1.21875, + "learning_rate": 3.52e-05, + "loss": 0.1553, + "step": 177 + }, + { + "epoch": 0.002848, + "grad_norm": 1.2578125, + "learning_rate": 3.54e-05, + "loss": 0.1247, + "step": 178 + }, + { + "epoch": 0.002864, + "grad_norm": 1.578125, + "learning_rate": 3.56e-05, + "loss": 0.1442, + "step": 179 + }, + { + "epoch": 0.00288, + "grad_norm": 0.92578125, + "learning_rate": 3.58e-05, + "loss": 0.1703, + "step": 180 + }, + { + "epoch": 0.002896, + "grad_norm": 0.921875, + "learning_rate": 3.6e-05, + "loss": 0.1625, + "step": 181 + }, + { + "epoch": 0.002912, + "grad_norm": 0.91015625, + "learning_rate": 3.62e-05, + "loss": 0.1524, + "step": 182 + }, + { + "epoch": 0.002928, + "grad_norm": 1.34375, + "learning_rate": 3.6400000000000004e-05, + "loss": 0.1852, + "step": 183 + }, + { + "epoch": 0.002944, + "grad_norm": 1.2890625, + "learning_rate": 3.66e-05, + "loss": 0.171, + "step": 184 + }, + { + "epoch": 0.00296, + "grad_norm": 0.69140625, + "learning_rate": 3.68e-05, + "loss": 0.1577, + "step": 185 + }, + { + "epoch": 0.002976, + "grad_norm": 0.97265625, + "learning_rate": 3.7e-05, + "loss": 0.1775, + "step": 186 + }, + { + "epoch": 0.002992, + "grad_norm": 0.7578125, + "learning_rate": 3.72e-05, + "loss": 0.172, + "step": 187 + }, + { + "epoch": 0.003008, + "grad_norm": 0.9453125, + "learning_rate": 3.74e-05, + "loss": 0.1324, + "step": 188 + }, + { + "epoch": 0.003024, + "grad_norm": 1.0703125, + "learning_rate": 3.76e-05, + "loss": 0.1733, + "step": 189 + }, + { + "epoch": 0.00304, + "grad_norm": 1.2578125, + "learning_rate": 3.7800000000000004e-05, + "loss": 0.173, + "step": 190 + }, + { + "epoch": 0.003056, + "grad_norm": 0.6875, + "learning_rate": 3.8e-05, + "loss": 0.1535, + "step": 191 + }, + { + "epoch": 0.003072, + "grad_norm": 0.87109375, + "learning_rate": 3.82e-05, + "loss": 0.1517, + "step": 192 + }, + { + "epoch": 0.003088, + "grad_norm": 1.5703125, + "learning_rate": 3.8400000000000005e-05, + "loss": 0.1485, + "step": 193 + }, + { + "epoch": 0.003104, + "grad_norm": 0.78125, + "learning_rate": 3.86e-05, + "loss": 0.1405, + "step": 194 + }, + { + "epoch": 0.00312, + "grad_norm": 1.1015625, + "learning_rate": 3.88e-05, + "loss": 0.1749, + "step": 195 + }, + { + "epoch": 0.003136, + "grad_norm": 0.78515625, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.1806, + "step": 196 + }, + { + "epoch": 0.003152, + "grad_norm": 0.88671875, + "learning_rate": 3.9200000000000004e-05, + "loss": 0.1921, + "step": 197 + }, + { + "epoch": 0.003168, + "grad_norm": 0.76953125, + "learning_rate": 3.94e-05, + "loss": 0.1632, + "step": 198 + }, + { + "epoch": 0.003184, + "grad_norm": 0.97265625, + "learning_rate": 3.960000000000001e-05, + "loss": 0.1807, + "step": 199 + }, + { + "epoch": 0.0032, + "grad_norm": 1.703125, + "learning_rate": 3.9800000000000005e-05, + "loss": 0.162, + "step": 200 + }, + { + "epoch": 0.003216, + "grad_norm": 1.6328125, + "learning_rate": 4e-05, + "loss": 0.1903, + "step": 201 + }, + { + "epoch": 0.003232, + "grad_norm": 1.5390625, + "learning_rate": 4.02e-05, + "loss": 0.15, + "step": 202 + }, + { + "epoch": 0.003248, + "grad_norm": 1.0390625, + "learning_rate": 4.0400000000000006e-05, + "loss": 0.1587, + "step": 203 + }, + { + "epoch": 0.003264, + "grad_norm": 1.34375, + "learning_rate": 4.0600000000000004e-05, + "loss": 0.1641, + "step": 204 + }, + { + "epoch": 0.00328, + "grad_norm": 0.875, + "learning_rate": 4.08e-05, + "loss": 0.1442, + "step": 205 + }, + { + "epoch": 0.003296, + "grad_norm": 0.875, + "learning_rate": 4.1e-05, + "loss": 0.1871, + "step": 206 + }, + { + "epoch": 0.003312, + "grad_norm": 0.8984375, + "learning_rate": 4.12e-05, + "loss": 0.1665, + "step": 207 + }, + { + "epoch": 0.003328, + "grad_norm": 0.7109375, + "learning_rate": 4.14e-05, + "loss": 0.188, + "step": 208 + }, + { + "epoch": 0.003344, + "grad_norm": 1.2265625, + "learning_rate": 4.16e-05, + "loss": 0.1708, + "step": 209 + }, + { + "epoch": 0.00336, + "grad_norm": 1.7421875, + "learning_rate": 4.18e-05, + "loss": 0.1738, + "step": 210 + }, + { + "epoch": 0.003376, + "grad_norm": 0.71484375, + "learning_rate": 4.2e-05, + "loss": 0.1465, + "step": 211 + }, + { + "epoch": 0.003392, + "grad_norm": 1.1875, + "learning_rate": 4.22e-05, + "loss": 0.1777, + "step": 212 + }, + { + "epoch": 0.003408, + "grad_norm": 1.53125, + "learning_rate": 4.24e-05, + "loss": 0.1629, + "step": 213 + }, + { + "epoch": 0.003424, + "grad_norm": 0.87109375, + "learning_rate": 4.26e-05, + "loss": 0.1491, + "step": 214 + }, + { + "epoch": 0.00344, + "grad_norm": 1.328125, + "learning_rate": 4.2800000000000004e-05, + "loss": 0.1601, + "step": 215 + }, + { + "epoch": 0.003456, + "grad_norm": 0.98046875, + "learning_rate": 4.3e-05, + "loss": 0.157, + "step": 216 + }, + { + "epoch": 0.003472, + "grad_norm": 0.87109375, + "learning_rate": 4.32e-05, + "loss": 0.1577, + "step": 217 + }, + { + "epoch": 0.003488, + "grad_norm": 0.890625, + "learning_rate": 4.3400000000000005e-05, + "loss": 0.2011, + "step": 218 + }, + { + "epoch": 0.003504, + "grad_norm": 0.91796875, + "learning_rate": 4.36e-05, + "loss": 0.165, + "step": 219 + }, + { + "epoch": 0.00352, + "grad_norm": 1.1328125, + "learning_rate": 4.38e-05, + "loss": 0.1751, + "step": 220 + }, + { + "epoch": 0.003536, + "grad_norm": 1.03125, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.1521, + "step": 221 + }, + { + "epoch": 0.003552, + "grad_norm": 0.92578125, + "learning_rate": 4.4200000000000004e-05, + "loss": 0.1781, + "step": 222 + }, + { + "epoch": 0.003568, + "grad_norm": 1.0234375, + "learning_rate": 4.44e-05, + "loss": 0.1545, + "step": 223 + }, + { + "epoch": 0.003584, + "grad_norm": 1.3125, + "learning_rate": 4.46e-05, + "loss": 0.1892, + "step": 224 + }, + { + "epoch": 0.0036, + "grad_norm": 0.93359375, + "learning_rate": 4.4800000000000005e-05, + "loss": 0.157, + "step": 225 + }, + { + "epoch": 0.003616, + "grad_norm": 0.87890625, + "learning_rate": 4.5e-05, + "loss": 0.1149, + "step": 226 + }, + { + "epoch": 0.003632, + "grad_norm": 1.1015625, + "learning_rate": 4.52e-05, + "loss": 0.1543, + "step": 227 + }, + { + "epoch": 0.003648, + "grad_norm": 1.1953125, + "learning_rate": 4.5400000000000006e-05, + "loss": 0.1527, + "step": 228 + }, + { + "epoch": 0.003664, + "grad_norm": 1.0703125, + "learning_rate": 4.5600000000000004e-05, + "loss": 0.1731, + "step": 229 + }, + { + "epoch": 0.00368, + "grad_norm": 1.03125, + "learning_rate": 4.58e-05, + "loss": 0.181, + "step": 230 + }, + { + "epoch": 0.003696, + "grad_norm": 0.71484375, + "learning_rate": 4.600000000000001e-05, + "loss": 0.1346, + "step": 231 + }, + { + "epoch": 0.003712, + "grad_norm": 1.34375, + "learning_rate": 4.6200000000000005e-05, + "loss": 0.1624, + "step": 232 + }, + { + "epoch": 0.003728, + "grad_norm": 0.7734375, + "learning_rate": 4.64e-05, + "loss": 0.1739, + "step": 233 + }, + { + "epoch": 0.003744, + "grad_norm": 1.3828125, + "learning_rate": 4.660000000000001e-05, + "loss": 0.1737, + "step": 234 + }, + { + "epoch": 0.00376, + "grad_norm": 1.2578125, + "learning_rate": 4.6800000000000006e-05, + "loss": 0.1791, + "step": 235 + }, + { + "epoch": 0.003776, + "grad_norm": 1.515625, + "learning_rate": 4.7e-05, + "loss": 0.1629, + "step": 236 + }, + { + "epoch": 0.003792, + "grad_norm": 0.78515625, + "learning_rate": 4.72e-05, + "loss": 0.1376, + "step": 237 + }, + { + "epoch": 0.003808, + "grad_norm": 3.09375, + "learning_rate": 4.74e-05, + "loss": 0.1914, + "step": 238 + }, + { + "epoch": 0.003824, + "grad_norm": 1.171875, + "learning_rate": 4.76e-05, + "loss": 0.2158, + "step": 239 + }, + { + "epoch": 0.00384, + "grad_norm": 1.3984375, + "learning_rate": 4.78e-05, + "loss": 0.1639, + "step": 240 + }, + { + "epoch": 0.003856, + "grad_norm": 1.140625, + "learning_rate": 4.8e-05, + "loss": 0.1623, + "step": 241 + }, + { + "epoch": 0.003872, + "grad_norm": 1.0859375, + "learning_rate": 4.82e-05, + "loss": 0.182, + "step": 242 + }, + { + "epoch": 0.003888, + "grad_norm": 0.85546875, + "learning_rate": 4.8400000000000004e-05, + "loss": 0.1513, + "step": 243 + }, + { + "epoch": 0.003904, + "grad_norm": 0.7578125, + "learning_rate": 4.86e-05, + "loss": 0.1746, + "step": 244 + }, + { + "epoch": 0.00392, + "grad_norm": 0.66015625, + "learning_rate": 4.88e-05, + "loss": 0.1398, + "step": 245 + }, + { + "epoch": 0.003936, + "grad_norm": 0.7890625, + "learning_rate": 4.9e-05, + "loss": 0.161, + "step": 246 + }, + { + "epoch": 0.003952, + "grad_norm": 1.1015625, + "learning_rate": 4.92e-05, + "loss": 0.1681, + "step": 247 + }, + { + "epoch": 0.003968, + "grad_norm": 1.3984375, + "learning_rate": 4.94e-05, + "loss": 0.1586, + "step": 248 + }, + { + "epoch": 0.003984, + "grad_norm": 0.84375, + "learning_rate": 4.96e-05, + "loss": 0.1859, + "step": 249 + }, + { + "epoch": 0.004, + "grad_norm": 1.21875, + "learning_rate": 4.9800000000000004e-05, + "loss": 0.1903, + "step": 250 + }, + { + "epoch": 0.004016, + "grad_norm": 0.734375, + "learning_rate": 5e-05, + "loss": 0.1602, + "step": 251 + }, + { + "epoch": 0.004032, + "grad_norm": 1.21875, + "learning_rate": 5.02e-05, + "loss": 0.1552, + "step": 252 + }, + { + "epoch": 0.004048, + "grad_norm": 1.796875, + "learning_rate": 5.0400000000000005e-05, + "loss": 0.1569, + "step": 253 + }, + { + "epoch": 0.004064, + "grad_norm": 0.9921875, + "learning_rate": 5.0600000000000003e-05, + "loss": 0.1393, + "step": 254 + }, + { + "epoch": 0.00408, + "grad_norm": 1.1796875, + "learning_rate": 5.08e-05, + "loss": 0.1806, + "step": 255 + }, + { + "epoch": 0.004096, + "grad_norm": 1.4140625, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.1881, + "step": 256 + }, + { + "epoch": 0.004112, + "grad_norm": 0.921875, + "learning_rate": 5.1200000000000004e-05, + "loss": 0.1707, + "step": 257 + }, + { + "epoch": 0.004128, + "grad_norm": 0.81640625, + "learning_rate": 5.14e-05, + "loss": 0.1334, + "step": 258 + }, + { + "epoch": 0.004144, + "grad_norm": 1.15625, + "learning_rate": 5.16e-05, + "loss": 0.1567, + "step": 259 + }, + { + "epoch": 0.00416, + "grad_norm": 0.92578125, + "learning_rate": 5.1800000000000005e-05, + "loss": 0.1968, + "step": 260 + }, + { + "epoch": 0.004176, + "grad_norm": 0.82421875, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.1845, + "step": 261 + }, + { + "epoch": 0.004192, + "grad_norm": 1.4375, + "learning_rate": 5.22e-05, + "loss": 0.178, + "step": 262 + }, + { + "epoch": 0.004208, + "grad_norm": 1.359375, + "learning_rate": 5.2400000000000007e-05, + "loss": 0.1573, + "step": 263 + }, + { + "epoch": 0.004224, + "grad_norm": 1.203125, + "learning_rate": 5.2600000000000005e-05, + "loss": 0.199, + "step": 264 + }, + { + "epoch": 0.00424, + "grad_norm": 1.1875, + "learning_rate": 5.28e-05, + "loss": 0.1837, + "step": 265 + }, + { + "epoch": 0.004256, + "grad_norm": 1.1953125, + "learning_rate": 5.300000000000001e-05, + "loss": 0.1973, + "step": 266 + }, + { + "epoch": 0.004272, + "grad_norm": 1.0546875, + "learning_rate": 5.3200000000000006e-05, + "loss": 0.1716, + "step": 267 + }, + { + "epoch": 0.004288, + "grad_norm": 2.09375, + "learning_rate": 5.3400000000000004e-05, + "loss": 0.1815, + "step": 268 + }, + { + "epoch": 0.004304, + "grad_norm": 1.2578125, + "learning_rate": 5.360000000000001e-05, + "loss": 0.182, + "step": 269 + }, + { + "epoch": 0.00432, + "grad_norm": 1.390625, + "learning_rate": 5.380000000000001e-05, + "loss": 0.1986, + "step": 270 + }, + { + "epoch": 0.004336, + "grad_norm": 1.5, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.1955, + "step": 271 + }, + { + "epoch": 0.004352, + "grad_norm": 1.4296875, + "learning_rate": 5.420000000000001e-05, + "loss": 0.1624, + "step": 272 + }, + { + "epoch": 0.004368, + "grad_norm": 1.0546875, + "learning_rate": 5.440000000000001e-05, + "loss": 0.1641, + "step": 273 + }, + { + "epoch": 0.004384, + "grad_norm": 13.25, + "learning_rate": 5.4600000000000006e-05, + "loss": 0.2348, + "step": 274 + }, + { + "epoch": 0.0044, + "grad_norm": 0.98046875, + "learning_rate": 5.4800000000000004e-05, + "loss": 0.195, + "step": 275 + }, + { + "epoch": 0.004416, + "grad_norm": 0.90625, + "learning_rate": 5.500000000000001e-05, + "loss": 0.1762, + "step": 276 + }, + { + "epoch": 0.004432, + "grad_norm": 0.79296875, + "learning_rate": 5.520000000000001e-05, + "loss": 0.191, + "step": 277 + }, + { + "epoch": 0.004448, + "grad_norm": 1.4765625, + "learning_rate": 5.5400000000000005e-05, + "loss": 0.1919, + "step": 278 + }, + { + "epoch": 0.004464, + "grad_norm": 1.0234375, + "learning_rate": 5.560000000000001e-05, + "loss": 0.1617, + "step": 279 + }, + { + "epoch": 0.00448, + "grad_norm": 1.0390625, + "learning_rate": 5.580000000000001e-05, + "loss": 0.1588, + "step": 280 + }, + { + "epoch": 0.004496, + "grad_norm": 0.8984375, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.1669, + "step": 281 + }, + { + "epoch": 0.004512, + "grad_norm": 0.9140625, + "learning_rate": 5.620000000000001e-05, + "loss": 0.1729, + "step": 282 + }, + { + "epoch": 0.004528, + "grad_norm": 1.2890625, + "learning_rate": 5.6399999999999995e-05, + "loss": 0.1824, + "step": 283 + }, + { + "epoch": 0.004544, + "grad_norm": 1.0, + "learning_rate": 5.66e-05, + "loss": 0.15, + "step": 284 + }, + { + "epoch": 0.00456, + "grad_norm": 1.1328125, + "learning_rate": 5.68e-05, + "loss": 0.1749, + "step": 285 + }, + { + "epoch": 0.004576, + "grad_norm": 1.0078125, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.1679, + "step": 286 + }, + { + "epoch": 0.004592, + "grad_norm": 1.265625, + "learning_rate": 5.72e-05, + "loss": 0.1455, + "step": 287 + }, + { + "epoch": 0.004608, + "grad_norm": 1.453125, + "learning_rate": 5.74e-05, + "loss": 0.2471, + "step": 288 + }, + { + "epoch": 0.004624, + "grad_norm": 1.1171875, + "learning_rate": 5.76e-05, + "loss": 0.218, + "step": 289 + }, + { + "epoch": 0.00464, + "grad_norm": 0.921875, + "learning_rate": 5.7799999999999995e-05, + "loss": 0.1709, + "step": 290 + }, + { + "epoch": 0.004656, + "grad_norm": 2.171875, + "learning_rate": 5.8e-05, + "loss": 0.1917, + "step": 291 + }, + { + "epoch": 0.004672, + "grad_norm": 1.0390625, + "learning_rate": 5.82e-05, + "loss": 0.1716, + "step": 292 + }, + { + "epoch": 0.004688, + "grad_norm": 1.2421875, + "learning_rate": 5.8399999999999997e-05, + "loss": 0.1675, + "step": 293 + }, + { + "epoch": 0.004704, + "grad_norm": 1.3125, + "learning_rate": 5.86e-05, + "loss": 0.1524, + "step": 294 + }, + { + "epoch": 0.00472, + "grad_norm": 0.56640625, + "learning_rate": 5.88e-05, + "loss": 0.1235, + "step": 295 + }, + { + "epoch": 0.004736, + "grad_norm": 1.6171875, + "learning_rate": 5.9e-05, + "loss": 0.22, + "step": 296 + }, + { + "epoch": 0.004752, + "grad_norm": 1.359375, + "learning_rate": 5.92e-05, + "loss": 0.1908, + "step": 297 + }, + { + "epoch": 0.004768, + "grad_norm": 1.3359375, + "learning_rate": 5.94e-05, + "loss": 0.1698, + "step": 298 + }, + { + "epoch": 0.004784, + "grad_norm": 1.8125, + "learning_rate": 5.96e-05, + "loss": 0.2789, + "step": 299 + }, + { + "epoch": 0.0048, + "grad_norm": 1.296875, + "learning_rate": 5.9800000000000003e-05, + "loss": 0.188, + "step": 300 + }, + { + "epoch": 0.004816, + "grad_norm": 1.1328125, + "learning_rate": 6e-05, + "loss": 0.1685, + "step": 301 + }, + { + "epoch": 0.004832, + "grad_norm": 1.3125, + "learning_rate": 6.02e-05, + "loss": 0.217, + "step": 302 + }, + { + "epoch": 0.004848, + "grad_norm": 1.34375, + "learning_rate": 6.04e-05, + "loss": 0.188, + "step": 303 + }, + { + "epoch": 0.004864, + "grad_norm": 0.96875, + "learning_rate": 6.06e-05, + "loss": 0.1696, + "step": 304 + }, + { + "epoch": 0.00488, + "grad_norm": 1.1953125, + "learning_rate": 6.08e-05, + "loss": 0.1866, + "step": 305 + }, + { + "epoch": 0.004896, + "grad_norm": 1.921875, + "learning_rate": 6.1e-05, + "loss": 0.2091, + "step": 306 + }, + { + "epoch": 0.004912, + "grad_norm": 1.6640625, + "learning_rate": 6.12e-05, + "loss": 0.2341, + "step": 307 + }, + { + "epoch": 0.004928, + "grad_norm": 1.171875, + "learning_rate": 6.14e-05, + "loss": 0.2002, + "step": 308 + }, + { + "epoch": 0.004944, + "grad_norm": 1.2890625, + "learning_rate": 6.16e-05, + "loss": 0.1621, + "step": 309 + }, + { + "epoch": 0.00496, + "grad_norm": 1.2265625, + "learning_rate": 6.18e-05, + "loss": 0.2535, + "step": 310 + }, + { + "epoch": 0.004976, + "grad_norm": 1.078125, + "learning_rate": 6.2e-05, + "loss": 0.1658, + "step": 311 + }, + { + "epoch": 0.004992, + "grad_norm": 1.1171875, + "learning_rate": 6.220000000000001e-05, + "loss": 0.1721, + "step": 312 + }, + { + "epoch": 0.005008, + "grad_norm": 1.3046875, + "learning_rate": 6.24e-05, + "loss": 0.1947, + "step": 313 + }, + { + "epoch": 0.005024, + "grad_norm": 1.765625, + "learning_rate": 6.26e-05, + "loss": 0.2242, + "step": 314 + }, + { + "epoch": 0.00504, + "grad_norm": 1.5, + "learning_rate": 6.280000000000001e-05, + "loss": 0.2143, + "step": 315 + }, + { + "epoch": 0.005056, + "grad_norm": 1.8984375, + "learning_rate": 6.3e-05, + "loss": 0.219, + "step": 316 + }, + { + "epoch": 0.005072, + "grad_norm": 1.15625, + "learning_rate": 6.32e-05, + "loss": 0.175, + "step": 317 + }, + { + "epoch": 0.005088, + "grad_norm": 1.4765625, + "learning_rate": 6.340000000000001e-05, + "loss": 0.2109, + "step": 318 + }, + { + "epoch": 0.005104, + "grad_norm": 1.5078125, + "learning_rate": 6.36e-05, + "loss": 0.2011, + "step": 319 + }, + { + "epoch": 0.00512, + "grad_norm": 1.234375, + "learning_rate": 6.38e-05, + "loss": 0.1508, + "step": 320 + }, + { + "epoch": 0.005136, + "grad_norm": 1.1953125, + "learning_rate": 6.400000000000001e-05, + "loss": 0.2092, + "step": 321 + }, + { + "epoch": 0.005152, + "grad_norm": 1.3828125, + "learning_rate": 6.42e-05, + "loss": 0.1644, + "step": 322 + }, + { + "epoch": 0.005168, + "grad_norm": 1.5546875, + "learning_rate": 6.440000000000001e-05, + "loss": 0.1869, + "step": 323 + }, + { + "epoch": 0.005184, + "grad_norm": 1.3828125, + "learning_rate": 6.460000000000001e-05, + "loss": 0.209, + "step": 324 + }, + { + "epoch": 0.0052, + "grad_norm": 1.078125, + "learning_rate": 6.48e-05, + "loss": 0.2072, + "step": 325 + }, + { + "epoch": 0.005216, + "grad_norm": 0.8359375, + "learning_rate": 6.500000000000001e-05, + "loss": 0.1649, + "step": 326 + }, + { + "epoch": 0.005232, + "grad_norm": 1.2421875, + "learning_rate": 6.52e-05, + "loss": 0.1741, + "step": 327 + }, + { + "epoch": 0.005248, + "grad_norm": 1.75, + "learning_rate": 6.54e-05, + "loss": 0.2492, + "step": 328 + }, + { + "epoch": 0.005264, + "grad_norm": 1.1953125, + "learning_rate": 6.560000000000001e-05, + "loss": 0.1856, + "step": 329 + }, + { + "epoch": 0.00528, + "grad_norm": 1.265625, + "learning_rate": 6.58e-05, + "loss": 0.1559, + "step": 330 + }, + { + "epoch": 0.005296, + "grad_norm": 1.7265625, + "learning_rate": 6.6e-05, + "loss": 0.1944, + "step": 331 + }, + { + "epoch": 0.005312, + "grad_norm": 2.34375, + "learning_rate": 6.620000000000001e-05, + "loss": 0.2133, + "step": 332 + }, + { + "epoch": 0.005328, + "grad_norm": 1.046875, + "learning_rate": 6.64e-05, + "loss": 0.1602, + "step": 333 + }, + { + "epoch": 0.005344, + "grad_norm": 3.875, + "learning_rate": 6.66e-05, + "loss": 0.2797, + "step": 334 + }, + { + "epoch": 0.00536, + "grad_norm": 1.578125, + "learning_rate": 6.680000000000001e-05, + "loss": 0.1907, + "step": 335 + }, + { + "epoch": 0.005376, + "grad_norm": 1.390625, + "learning_rate": 6.7e-05, + "loss": 0.17, + "step": 336 + }, + { + "epoch": 0.005392, + "grad_norm": 1.0234375, + "learning_rate": 6.720000000000001e-05, + "loss": 0.1652, + "step": 337 + }, + { + "epoch": 0.005408, + "grad_norm": 1.890625, + "learning_rate": 6.740000000000001e-05, + "loss": 0.1889, + "step": 338 + }, + { + "epoch": 0.005424, + "grad_norm": 1.2578125, + "learning_rate": 6.76e-05, + "loss": 0.1817, + "step": 339 + }, + { + "epoch": 0.00544, + "grad_norm": 1.140625, + "learning_rate": 6.780000000000001e-05, + "loss": 0.1477, + "step": 340 + }, + { + "epoch": 0.005456, + "grad_norm": 1.296875, + "learning_rate": 6.800000000000001e-05, + "loss": 0.2106, + "step": 341 + }, + { + "epoch": 0.005472, + "grad_norm": 1.6796875, + "learning_rate": 6.82e-05, + "loss": 0.206, + "step": 342 + }, + { + "epoch": 0.005488, + "grad_norm": 1.4140625, + "learning_rate": 6.840000000000001e-05, + "loss": 0.2199, + "step": 343 + }, + { + "epoch": 0.005504, + "grad_norm": 1.4921875, + "learning_rate": 6.860000000000001e-05, + "loss": 0.1613, + "step": 344 + }, + { + "epoch": 0.00552, + "grad_norm": 1.390625, + "learning_rate": 6.879999999999999e-05, + "loss": 0.2386, + "step": 345 + }, + { + "epoch": 0.005536, + "grad_norm": 1.375, + "learning_rate": 6.9e-05, + "loss": 0.2058, + "step": 346 + }, + { + "epoch": 0.005552, + "grad_norm": 3.84375, + "learning_rate": 6.92e-05, + "loss": 0.2339, + "step": 347 + }, + { + "epoch": 0.005568, + "grad_norm": 1.4765625, + "learning_rate": 6.939999999999999e-05, + "loss": 0.2376, + "step": 348 + }, + { + "epoch": 0.005584, + "grad_norm": 1.3671875, + "learning_rate": 6.96e-05, + "loss": 0.2087, + "step": 349 + }, + { + "epoch": 0.0056, + "grad_norm": 1.296875, + "learning_rate": 6.98e-05, + "loss": 0.1991, + "step": 350 + }, + { + "epoch": 0.005616, + "grad_norm": 1.1875, + "learning_rate": 7e-05, + "loss": 0.2139, + "step": 351 + }, + { + "epoch": 0.005632, + "grad_norm": 1.125, + "learning_rate": 7.02e-05, + "loss": 0.2344, + "step": 352 + }, + { + "epoch": 0.005648, + "grad_norm": 0.98046875, + "learning_rate": 7.04e-05, + "loss": 0.1728, + "step": 353 + }, + { + "epoch": 0.005664, + "grad_norm": 1.046875, + "learning_rate": 7.06e-05, + "loss": 0.2156, + "step": 354 + }, + { + "epoch": 0.00568, + "grad_norm": 1.96875, + "learning_rate": 7.08e-05, + "loss": 0.2952, + "step": 355 + }, + { + "epoch": 0.005696, + "grad_norm": 1.5234375, + "learning_rate": 7.1e-05, + "loss": 0.2214, + "step": 356 + }, + { + "epoch": 0.005712, + "grad_norm": 1.296875, + "learning_rate": 7.12e-05, + "loss": 0.193, + "step": 357 + }, + { + "epoch": 0.005728, + "grad_norm": 1.59375, + "learning_rate": 7.14e-05, + "loss": 0.2172, + "step": 358 + }, + { + "epoch": 0.005744, + "grad_norm": 1.375, + "learning_rate": 7.16e-05, + "loss": 0.1934, + "step": 359 + }, + { + "epoch": 0.00576, + "grad_norm": 1.015625, + "learning_rate": 7.18e-05, + "loss": 0.1801, + "step": 360 + }, + { + "epoch": 0.005776, + "grad_norm": 1.2265625, + "learning_rate": 7.2e-05, + "loss": 0.2162, + "step": 361 + }, + { + "epoch": 0.005792, + "grad_norm": 1.734375, + "learning_rate": 7.22e-05, + "loss": 0.2739, + "step": 362 + }, + { + "epoch": 0.005808, + "grad_norm": 1.25, + "learning_rate": 7.24e-05, + "loss": 0.208, + "step": 363 + }, + { + "epoch": 0.005824, + "grad_norm": 1.4375, + "learning_rate": 7.26e-05, + "loss": 0.24, + "step": 364 + }, + { + "epoch": 0.00584, + "grad_norm": 1.25, + "learning_rate": 7.280000000000001e-05, + "loss": 0.2156, + "step": 365 + }, + { + "epoch": 0.005856, + "grad_norm": 1.125, + "learning_rate": 7.3e-05, + "loss": 0.1786, + "step": 366 + }, + { + "epoch": 0.005872, + "grad_norm": 1.2265625, + "learning_rate": 7.32e-05, + "loss": 0.1968, + "step": 367 + }, + { + "epoch": 0.005888, + "grad_norm": 0.9375, + "learning_rate": 7.340000000000001e-05, + "loss": 0.1776, + "step": 368 + }, + { + "epoch": 0.005904, + "grad_norm": 2.0, + "learning_rate": 7.36e-05, + "loss": 0.248, + "step": 369 + }, + { + "epoch": 0.00592, + "grad_norm": 1.203125, + "learning_rate": 7.38e-05, + "loss": 0.206, + "step": 370 + }, + { + "epoch": 0.005936, + "grad_norm": 1.5859375, + "learning_rate": 7.4e-05, + "loss": 0.1962, + "step": 371 + }, + { + "epoch": 0.005952, + "grad_norm": 1.6171875, + "learning_rate": 7.42e-05, + "loss": 0.1737, + "step": 372 + }, + { + "epoch": 0.005968, + "grad_norm": 5.25, + "learning_rate": 7.44e-05, + "loss": 0.2248, + "step": 373 + }, + { + "epoch": 0.005984, + "grad_norm": 1.171875, + "learning_rate": 7.46e-05, + "loss": 0.1612, + "step": 374 + }, + { + "epoch": 0.006, + "grad_norm": 1.53125, + "learning_rate": 7.48e-05, + "loss": 0.1882, + "step": 375 + }, + { + "epoch": 0.006016, + "grad_norm": 1.375, + "learning_rate": 7.500000000000001e-05, + "loss": 0.2168, + "step": 376 + }, + { + "epoch": 0.006032, + "grad_norm": 3.390625, + "learning_rate": 7.52e-05, + "loss": 0.2457, + "step": 377 + }, + { + "epoch": 0.006048, + "grad_norm": 1.5546875, + "learning_rate": 7.54e-05, + "loss": 0.2265, + "step": 378 + }, + { + "epoch": 0.006064, + "grad_norm": 1.4921875, + "learning_rate": 7.560000000000001e-05, + "loss": 0.1963, + "step": 379 + }, + { + "epoch": 0.00608, + "grad_norm": 1.2421875, + "learning_rate": 7.58e-05, + "loss": 0.1976, + "step": 380 + }, + { + "epoch": 0.006096, + "grad_norm": 2.296875, + "learning_rate": 7.6e-05, + "loss": 0.2828, + "step": 381 + }, + { + "epoch": 0.006112, + "grad_norm": 1.7109375, + "learning_rate": 7.620000000000001e-05, + "loss": 0.1964, + "step": 382 + }, + { + "epoch": 0.006128, + "grad_norm": 1.4609375, + "learning_rate": 7.64e-05, + "loss": 0.2206, + "step": 383 + }, + { + "epoch": 0.006144, + "grad_norm": 4.59375, + "learning_rate": 7.66e-05, + "loss": 0.2297, + "step": 384 + }, + { + "epoch": 0.00616, + "grad_norm": 1.484375, + "learning_rate": 7.680000000000001e-05, + "loss": 0.2095, + "step": 385 + }, + { + "epoch": 0.006176, + "grad_norm": 2.140625, + "learning_rate": 7.7e-05, + "loss": 0.2175, + "step": 386 + }, + { + "epoch": 0.006192, + "grad_norm": 1.0703125, + "learning_rate": 7.72e-05, + "loss": 0.1973, + "step": 387 + }, + { + "epoch": 0.006208, + "grad_norm": 2.015625, + "learning_rate": 7.740000000000001e-05, + "loss": 0.2281, + "step": 388 + }, + { + "epoch": 0.006224, + "grad_norm": 2.3125, + "learning_rate": 7.76e-05, + "loss": 0.266, + "step": 389 + }, + { + "epoch": 0.00624, + "grad_norm": 1.7421875, + "learning_rate": 7.780000000000001e-05, + "loss": 0.2339, + "step": 390 + }, + { + "epoch": 0.006256, + "grad_norm": 1.578125, + "learning_rate": 7.800000000000001e-05, + "loss": 0.2287, + "step": 391 + }, + { + "epoch": 0.006272, + "grad_norm": 1.6640625, + "learning_rate": 7.82e-05, + "loss": 0.2325, + "step": 392 + }, + { + "epoch": 0.006288, + "grad_norm": 1.984375, + "learning_rate": 7.840000000000001e-05, + "loss": 0.2713, + "step": 393 + }, + { + "epoch": 0.006304, + "grad_norm": 1.3046875, + "learning_rate": 7.860000000000001e-05, + "loss": 0.1938, + "step": 394 + }, + { + "epoch": 0.00632, + "grad_norm": 1.96875, + "learning_rate": 7.88e-05, + "loss": 0.2365, + "step": 395 + }, + { + "epoch": 0.006336, + "grad_norm": 1.1796875, + "learning_rate": 7.900000000000001e-05, + "loss": 0.2007, + "step": 396 + }, + { + "epoch": 0.006352, + "grad_norm": 2.265625, + "learning_rate": 7.920000000000001e-05, + "loss": 0.1852, + "step": 397 + }, + { + "epoch": 0.006368, + "grad_norm": 1.65625, + "learning_rate": 7.94e-05, + "loss": 0.245, + "step": 398 + }, + { + "epoch": 0.006384, + "grad_norm": 1.4609375, + "learning_rate": 7.960000000000001e-05, + "loss": 0.2187, + "step": 399 + }, + { + "epoch": 0.0064, + "grad_norm": 1.359375, + "learning_rate": 7.98e-05, + "loss": 0.1844, + "step": 400 + }, + { + "epoch": 0.006416, + "grad_norm": 1.296875, + "learning_rate": 8e-05, + "loss": 0.238, + "step": 401 + }, + { + "epoch": 0.006432, + "grad_norm": 1.8984375, + "learning_rate": 8.020000000000001e-05, + "loss": 0.1917, + "step": 402 + }, + { + "epoch": 0.006448, + "grad_norm": 1.5390625, + "learning_rate": 8.04e-05, + "loss": 0.1864, + "step": 403 + }, + { + "epoch": 0.006464, + "grad_norm": 1.6015625, + "learning_rate": 8.060000000000001e-05, + "loss": 0.2462, + "step": 404 + }, + { + "epoch": 0.00648, + "grad_norm": 1.5625, + "learning_rate": 8.080000000000001e-05, + "loss": 0.2428, + "step": 405 + }, + { + "epoch": 0.006496, + "grad_norm": 3.0, + "learning_rate": 8.1e-05, + "loss": 0.2489, + "step": 406 + }, + { + "epoch": 0.006512, + "grad_norm": 1.1796875, + "learning_rate": 8.120000000000001e-05, + "loss": 0.2184, + "step": 407 + }, + { + "epoch": 0.006528, + "grad_norm": 1.25, + "learning_rate": 8.14e-05, + "loss": 0.2057, + "step": 408 + }, + { + "epoch": 0.006544, + "grad_norm": 1.125, + "learning_rate": 8.16e-05, + "loss": 0.167, + "step": 409 + }, + { + "epoch": 0.00656, + "grad_norm": 1.7421875, + "learning_rate": 8.18e-05, + "loss": 0.1764, + "step": 410 + }, + { + "epoch": 0.006576, + "grad_norm": 1.1328125, + "learning_rate": 8.2e-05, + "loss": 0.1765, + "step": 411 + }, + { + "epoch": 0.006592, + "grad_norm": 1.28125, + "learning_rate": 8.22e-05, + "loss": 0.2193, + "step": 412 + }, + { + "epoch": 0.006608, + "grad_norm": 1.3203125, + "learning_rate": 8.24e-05, + "loss": 0.2179, + "step": 413 + }, + { + "epoch": 0.006624, + "grad_norm": 1.53125, + "learning_rate": 8.26e-05, + "loss": 0.2155, + "step": 414 + }, + { + "epoch": 0.00664, + "grad_norm": 1.5, + "learning_rate": 8.28e-05, + "loss": 0.2436, + "step": 415 + }, + { + "epoch": 0.006656, + "grad_norm": 1.328125, + "learning_rate": 8.3e-05, + "loss": 0.2089, + "step": 416 + }, + { + "epoch": 0.006672, + "grad_norm": 1.828125, + "learning_rate": 8.32e-05, + "loss": 0.2113, + "step": 417 + }, + { + "epoch": 0.006688, + "grad_norm": 1.765625, + "learning_rate": 8.34e-05, + "loss": 0.2776, + "step": 418 + }, + { + "epoch": 0.006704, + "grad_norm": 1.4453125, + "learning_rate": 8.36e-05, + "loss": 0.2434, + "step": 419 + }, + { + "epoch": 0.00672, + "grad_norm": 1.34375, + "learning_rate": 8.38e-05, + "loss": 0.2021, + "step": 420 + }, + { + "epoch": 0.006736, + "grad_norm": 1.6171875, + "learning_rate": 8.4e-05, + "loss": 0.2034, + "step": 421 + }, + { + "epoch": 0.006752, + "grad_norm": 1.6171875, + "learning_rate": 8.42e-05, + "loss": 0.2494, + "step": 422 + }, + { + "epoch": 0.006768, + "grad_norm": 2.0, + "learning_rate": 8.44e-05, + "loss": 0.3055, + "step": 423 + }, + { + "epoch": 0.006784, + "grad_norm": 1.1015625, + "learning_rate": 8.46e-05, + "loss": 0.2416, + "step": 424 + }, + { + "epoch": 0.0068, + "grad_norm": 1.453125, + "learning_rate": 8.48e-05, + "loss": 0.2095, + "step": 425 + }, + { + "epoch": 0.006816, + "grad_norm": 1.1875, + "learning_rate": 8.5e-05, + "loss": 0.2085, + "step": 426 + }, + { + "epoch": 0.006832, + "grad_norm": 1.1953125, + "learning_rate": 8.52e-05, + "loss": 0.1958, + "step": 427 + }, + { + "epoch": 0.006848, + "grad_norm": 2.609375, + "learning_rate": 8.54e-05, + "loss": 0.2087, + "step": 428 + }, + { + "epoch": 0.006864, + "grad_norm": 1.3984375, + "learning_rate": 8.560000000000001e-05, + "loss": 0.2584, + "step": 429 + }, + { + "epoch": 0.00688, + "grad_norm": 1.6796875, + "learning_rate": 8.58e-05, + "loss": 0.239, + "step": 430 + }, + { + "epoch": 0.006896, + "grad_norm": 1.421875, + "learning_rate": 8.6e-05, + "loss": 0.1848, + "step": 431 + }, + { + "epoch": 0.006912, + "grad_norm": 2.109375, + "learning_rate": 8.620000000000001e-05, + "loss": 0.2342, + "step": 432 + }, + { + "epoch": 0.006928, + "grad_norm": 1.375, + "learning_rate": 8.64e-05, + "loss": 0.1857, + "step": 433 + }, + { + "epoch": 0.006944, + "grad_norm": 1.6484375, + "learning_rate": 8.66e-05, + "loss": 0.2663, + "step": 434 + }, + { + "epoch": 0.00696, + "grad_norm": 1.484375, + "learning_rate": 8.680000000000001e-05, + "loss": 0.272, + "step": 435 + }, + { + "epoch": 0.006976, + "grad_norm": 2.046875, + "learning_rate": 8.7e-05, + "loss": 0.2859, + "step": 436 + }, + { + "epoch": 0.006992, + "grad_norm": 1.7578125, + "learning_rate": 8.72e-05, + "loss": 0.2416, + "step": 437 + }, + { + "epoch": 0.007008, + "grad_norm": 1.328125, + "learning_rate": 8.740000000000001e-05, + "loss": 0.2192, + "step": 438 + }, + { + "epoch": 0.007024, + "grad_norm": 1.6796875, + "learning_rate": 8.76e-05, + "loss": 0.2133, + "step": 439 + }, + { + "epoch": 0.00704, + "grad_norm": 1.34375, + "learning_rate": 8.78e-05, + "loss": 0.2125, + "step": 440 + }, + { + "epoch": 0.007056, + "grad_norm": 1.328125, + "learning_rate": 8.800000000000001e-05, + "loss": 0.2122, + "step": 441 + }, + { + "epoch": 0.007072, + "grad_norm": 1.0703125, + "learning_rate": 8.82e-05, + "loss": 0.168, + "step": 442 + }, + { + "epoch": 0.007088, + "grad_norm": 1.484375, + "learning_rate": 8.840000000000001e-05, + "loss": 0.2246, + "step": 443 + }, + { + "epoch": 0.007104, + "grad_norm": 1.0859375, + "learning_rate": 8.86e-05, + "loss": 0.1966, + "step": 444 + }, + { + "epoch": 0.00712, + "grad_norm": 1.9765625, + "learning_rate": 8.88e-05, + "loss": 0.2148, + "step": 445 + }, + { + "epoch": 0.007136, + "grad_norm": 1.296875, + "learning_rate": 8.900000000000001e-05, + "loss": 0.2048, + "step": 446 + }, + { + "epoch": 0.007152, + "grad_norm": 1.859375, + "learning_rate": 8.92e-05, + "loss": 0.1985, + "step": 447 + }, + { + "epoch": 0.007168, + "grad_norm": 1.234375, + "learning_rate": 8.94e-05, + "loss": 0.1884, + "step": 448 + }, + { + "epoch": 0.007184, + "grad_norm": 1.5390625, + "learning_rate": 8.960000000000001e-05, + "loss": 0.2292, + "step": 449 + }, + { + "epoch": 0.0072, + "grad_norm": 1.796875, + "learning_rate": 8.98e-05, + "loss": 0.25, + "step": 450 + }, + { + "epoch": 0.007216, + "grad_norm": 1.4453125, + "learning_rate": 9e-05, + "loss": 0.2437, + "step": 451 + }, + { + "epoch": 0.007232, + "grad_norm": 2.125, + "learning_rate": 9.020000000000001e-05, + "loss": 0.2758, + "step": 452 + }, + { + "epoch": 0.007248, + "grad_norm": 1.625, + "learning_rate": 9.04e-05, + "loss": 0.2387, + "step": 453 + }, + { + "epoch": 0.007264, + "grad_norm": 2.140625, + "learning_rate": 9.06e-05, + "loss": 0.198, + "step": 454 + }, + { + "epoch": 0.00728, + "grad_norm": 1.1640625, + "learning_rate": 9.080000000000001e-05, + "loss": 0.2135, + "step": 455 + }, + { + "epoch": 0.007296, + "grad_norm": 1.015625, + "learning_rate": 9.1e-05, + "loss": 0.2291, + "step": 456 + }, + { + "epoch": 0.007312, + "grad_norm": 1.4296875, + "learning_rate": 9.120000000000001e-05, + "loss": 0.2706, + "step": 457 + }, + { + "epoch": 0.007328, + "grad_norm": 1.46875, + "learning_rate": 9.140000000000001e-05, + "loss": 0.211, + "step": 458 + }, + { + "epoch": 0.007344, + "grad_norm": 1.7421875, + "learning_rate": 9.16e-05, + "loss": 0.2634, + "step": 459 + }, + { + "epoch": 0.00736, + "grad_norm": 1.2578125, + "learning_rate": 9.180000000000001e-05, + "loss": 0.25, + "step": 460 + }, + { + "epoch": 0.007376, + "grad_norm": 1.5390625, + "learning_rate": 9.200000000000001e-05, + "loss": 0.2523, + "step": 461 + }, + { + "epoch": 0.007392, + "grad_norm": 1.21875, + "learning_rate": 9.22e-05, + "loss": 0.1944, + "step": 462 + }, + { + "epoch": 0.007408, + "grad_norm": 1.421875, + "learning_rate": 9.240000000000001e-05, + "loss": 0.2117, + "step": 463 + }, + { + "epoch": 0.007424, + "grad_norm": 1.2734375, + "learning_rate": 9.260000000000001e-05, + "loss": 0.2227, + "step": 464 + }, + { + "epoch": 0.00744, + "grad_norm": 1.390625, + "learning_rate": 9.28e-05, + "loss": 0.2362, + "step": 465 + }, + { + "epoch": 0.007456, + "grad_norm": 1.7578125, + "learning_rate": 9.300000000000001e-05, + "loss": 0.2246, + "step": 466 + }, + { + "epoch": 0.007472, + "grad_norm": 1.25, + "learning_rate": 9.320000000000002e-05, + "loss": 0.1741, + "step": 467 + }, + { + "epoch": 0.007488, + "grad_norm": 2.125, + "learning_rate": 9.340000000000001e-05, + "loss": 0.2058, + "step": 468 + }, + { + "epoch": 0.007504, + "grad_norm": 1.671875, + "learning_rate": 9.360000000000001e-05, + "loss": 0.2051, + "step": 469 + }, + { + "epoch": 0.00752, + "grad_norm": 1.109375, + "learning_rate": 9.38e-05, + "loss": 0.2042, + "step": 470 + }, + { + "epoch": 0.007536, + "grad_norm": 3.5, + "learning_rate": 9.4e-05, + "loss": 0.2497, + "step": 471 + }, + { + "epoch": 0.007552, + "grad_norm": 1.0703125, + "learning_rate": 9.42e-05, + "loss": 0.2009, + "step": 472 + }, + { + "epoch": 0.007568, + "grad_norm": 1.4296875, + "learning_rate": 9.44e-05, + "loss": 0.1929, + "step": 473 + }, + { + "epoch": 0.007584, + "grad_norm": 1.4609375, + "learning_rate": 9.46e-05, + "loss": 0.1984, + "step": 474 + }, + { + "epoch": 0.0076, + "grad_norm": 1.890625, + "learning_rate": 9.48e-05, + "loss": 0.2615, + "step": 475 + }, + { + "epoch": 0.007616, + "grad_norm": 2.625, + "learning_rate": 9.5e-05, + "loss": 0.2284, + "step": 476 + }, + { + "epoch": 0.007632, + "grad_norm": 1.5234375, + "learning_rate": 9.52e-05, + "loss": 0.2151, + "step": 477 + }, + { + "epoch": 0.007648, + "grad_norm": 2.203125, + "learning_rate": 9.54e-05, + "loss": 0.2091, + "step": 478 + }, + { + "epoch": 0.007664, + "grad_norm": 2.203125, + "learning_rate": 9.56e-05, + "loss": 0.2084, + "step": 479 + }, + { + "epoch": 0.00768, + "grad_norm": 1.3203125, + "learning_rate": 9.58e-05, + "loss": 0.2107, + "step": 480 + }, + { + "epoch": 0.007696, + "grad_norm": 1.390625, + "learning_rate": 9.6e-05, + "loss": 0.2514, + "step": 481 + }, + { + "epoch": 0.007712, + "grad_norm": 1.1328125, + "learning_rate": 9.620000000000001e-05, + "loss": 0.1967, + "step": 482 + }, + { + "epoch": 0.007728, + "grad_norm": 1.7890625, + "learning_rate": 9.64e-05, + "loss": 0.2129, + "step": 483 + }, + { + "epoch": 0.007744, + "grad_norm": 1.2890625, + "learning_rate": 9.66e-05, + "loss": 0.2275, + "step": 484 + }, + { + "epoch": 0.00776, + "grad_norm": 1.140625, + "learning_rate": 9.680000000000001e-05, + "loss": 0.1816, + "step": 485 + }, + { + "epoch": 0.007776, + "grad_norm": 1.296875, + "learning_rate": 9.7e-05, + "loss": 0.2074, + "step": 486 + }, + { + "epoch": 0.007792, + "grad_norm": 1.4765625, + "learning_rate": 9.72e-05, + "loss": 0.2636, + "step": 487 + }, + { + "epoch": 0.007808, + "grad_norm": 1.2265625, + "learning_rate": 9.74e-05, + "loss": 0.193, + "step": 488 + }, + { + "epoch": 0.007824, + "grad_norm": 1.546875, + "learning_rate": 9.76e-05, + "loss": 0.2516, + "step": 489 + }, + { + "epoch": 0.00784, + "grad_norm": 1.578125, + "learning_rate": 9.78e-05, + "loss": 0.2639, + "step": 490 + }, + { + "epoch": 0.007856, + "grad_norm": 1.796875, + "learning_rate": 9.8e-05, + "loss": 0.2206, + "step": 491 + }, + { + "epoch": 0.007872, + "grad_norm": 1.6953125, + "learning_rate": 9.82e-05, + "loss": 0.2616, + "step": 492 + }, + { + "epoch": 0.007888, + "grad_norm": 1.3984375, + "learning_rate": 9.84e-05, + "loss": 0.239, + "step": 493 + }, + { + "epoch": 0.007904, + "grad_norm": 1.265625, + "learning_rate": 9.86e-05, + "loss": 0.2132, + "step": 494 + }, + { + "epoch": 0.00792, + "grad_norm": 1.609375, + "learning_rate": 9.88e-05, + "loss": 0.2131, + "step": 495 + }, + { + "epoch": 0.007936, + "grad_norm": 1.6953125, + "learning_rate": 9.900000000000001e-05, + "loss": 0.2324, + "step": 496 + }, + { + "epoch": 0.007952, + "grad_norm": 3.640625, + "learning_rate": 9.92e-05, + "loss": 0.2457, + "step": 497 + }, + { + "epoch": 0.007968, + "grad_norm": 2.34375, + "learning_rate": 9.94e-05, + "loss": 0.2534, + "step": 498 + }, + { + "epoch": 0.007984, + "grad_norm": 2.453125, + "learning_rate": 9.960000000000001e-05, + "loss": 0.2008, + "step": 499 + }, + { + "epoch": 0.008, + "grad_norm": 1.28125, + "learning_rate": 9.98e-05, + "loss": 0.2296, + "step": 500 + }, + { + "epoch": 0.008016, + "grad_norm": 1.1640625, + "learning_rate": 0.0001, + "loss": 0.241, + "step": 501 + }, + { + "epoch": 0.008032, + "grad_norm": 1.5859375, + "learning_rate": 9.999838709677419e-05, + "loss": 0.1918, + "step": 502 + }, + { + "epoch": 0.008048, + "grad_norm": 1.8046875, + "learning_rate": 9.999677419354839e-05, + "loss": 0.2266, + "step": 503 + }, + { + "epoch": 0.008064, + "grad_norm": 1.1796875, + "learning_rate": 9.999516129032258e-05, + "loss": 0.2164, + "step": 504 + }, + { + "epoch": 0.00808, + "grad_norm": 1.28125, + "learning_rate": 9.999354838709678e-05, + "loss": 0.2183, + "step": 505 + }, + { + "epoch": 0.008096, + "grad_norm": 1.2421875, + "learning_rate": 9.999193548387096e-05, + "loss": 0.1946, + "step": 506 + }, + { + "epoch": 0.008112, + "grad_norm": 1.625, + "learning_rate": 9.999032258064516e-05, + "loss": 0.2632, + "step": 507 + }, + { + "epoch": 0.008128, + "grad_norm": 1.28125, + "learning_rate": 9.998870967741936e-05, + "loss": 0.2165, + "step": 508 + }, + { + "epoch": 0.008144, + "grad_norm": 1.171875, + "learning_rate": 9.998709677419356e-05, + "loss": 0.2158, + "step": 509 + }, + { + "epoch": 0.00816, + "grad_norm": 1.65625, + "learning_rate": 9.998548387096775e-05, + "loss": 0.199, + "step": 510 + }, + { + "epoch": 0.008176, + "grad_norm": 1.3046875, + "learning_rate": 9.998387096774195e-05, + "loss": 0.1898, + "step": 511 + }, + { + "epoch": 0.008192, + "grad_norm": 1.6875, + "learning_rate": 9.998225806451613e-05, + "loss": 0.2538, + "step": 512 + }, + { + "epoch": 0.008208, + "grad_norm": 1.1328125, + "learning_rate": 9.998064516129033e-05, + "loss": 0.1981, + "step": 513 + }, + { + "epoch": 0.008224, + "grad_norm": 1.5546875, + "learning_rate": 9.997903225806452e-05, + "loss": 0.2577, + "step": 514 + }, + { + "epoch": 0.00824, + "grad_norm": 1.3203125, + "learning_rate": 9.997741935483872e-05, + "loss": 0.215, + "step": 515 + }, + { + "epoch": 0.008256, + "grad_norm": 1.1875, + "learning_rate": 9.99758064516129e-05, + "loss": 0.2181, + "step": 516 + }, + { + "epoch": 0.008272, + "grad_norm": 2.71875, + "learning_rate": 9.997419354838709e-05, + "loss": 0.273, + "step": 517 + }, + { + "epoch": 0.008288, + "grad_norm": 1.3671875, + "learning_rate": 9.997258064516129e-05, + "loss": 0.2044, + "step": 518 + }, + { + "epoch": 0.008304, + "grad_norm": 1.2890625, + "learning_rate": 9.997096774193549e-05, + "loss": 0.2378, + "step": 519 + }, + { + "epoch": 0.00832, + "grad_norm": 1.3359375, + "learning_rate": 9.996935483870969e-05, + "loss": 0.2251, + "step": 520 + }, + { + "epoch": 0.008336, + "grad_norm": 1.171875, + "learning_rate": 9.996774193548388e-05, + "loss": 0.2481, + "step": 521 + }, + { + "epoch": 0.008352, + "grad_norm": 1.4453125, + "learning_rate": 9.996612903225808e-05, + "loss": 0.2398, + "step": 522 + }, + { + "epoch": 0.008368, + "grad_norm": 1.265625, + "learning_rate": 9.996451612903226e-05, + "loss": 0.2458, + "step": 523 + }, + { + "epoch": 0.008384, + "grad_norm": 2.078125, + "learning_rate": 9.996290322580646e-05, + "loss": 0.2507, + "step": 524 + }, + { + "epoch": 0.0084, + "grad_norm": 2.015625, + "learning_rate": 9.996129032258065e-05, + "loss": 0.257, + "step": 525 + }, + { + "epoch": 0.008416, + "grad_norm": 1.484375, + "learning_rate": 9.995967741935485e-05, + "loss": 0.2593, + "step": 526 + }, + { + "epoch": 0.008432, + "grad_norm": 1.53125, + "learning_rate": 9.995806451612903e-05, + "loss": 0.2966, + "step": 527 + }, + { + "epoch": 0.008448, + "grad_norm": 1.1328125, + "learning_rate": 9.995645161290323e-05, + "loss": 0.2117, + "step": 528 + }, + { + "epoch": 0.008464, + "grad_norm": 1.1640625, + "learning_rate": 9.995483870967742e-05, + "loss": 0.2072, + "step": 529 + }, + { + "epoch": 0.00848, + "grad_norm": 0.94140625, + "learning_rate": 9.995322580645162e-05, + "loss": 0.1685, + "step": 530 + }, + { + "epoch": 0.008496, + "grad_norm": 1.40625, + "learning_rate": 9.995161290322582e-05, + "loss": 0.2181, + "step": 531 + }, + { + "epoch": 0.008512, + "grad_norm": 1.703125, + "learning_rate": 9.995e-05, + "loss": 0.271, + "step": 532 + }, + { + "epoch": 0.008528, + "grad_norm": 1.3671875, + "learning_rate": 9.99483870967742e-05, + "loss": 0.2361, + "step": 533 + }, + { + "epoch": 0.008544, + "grad_norm": 1.4453125, + "learning_rate": 9.994677419354839e-05, + "loss": 0.2176, + "step": 534 + }, + { + "epoch": 0.00856, + "grad_norm": 1.4765625, + "learning_rate": 9.994516129032259e-05, + "loss": 0.2223, + "step": 535 + }, + { + "epoch": 0.008576, + "grad_norm": 1.375, + "learning_rate": 9.994354838709677e-05, + "loss": 0.2274, + "step": 536 + }, + { + "epoch": 0.008592, + "grad_norm": 1.1484375, + "learning_rate": 9.994193548387097e-05, + "loss": 0.2121, + "step": 537 + }, + { + "epoch": 0.008608, + "grad_norm": 1.8671875, + "learning_rate": 9.994032258064516e-05, + "loss": 0.2049, + "step": 538 + }, + { + "epoch": 0.008624, + "grad_norm": 1.3046875, + "learning_rate": 9.993870967741936e-05, + "loss": 0.205, + "step": 539 + }, + { + "epoch": 0.00864, + "grad_norm": 1.4296875, + "learning_rate": 9.993709677419355e-05, + "loss": 0.1844, + "step": 540 + }, + { + "epoch": 0.008656, + "grad_norm": 1.0703125, + "learning_rate": 9.993548387096775e-05, + "loss": 0.2195, + "step": 541 + }, + { + "epoch": 0.008672, + "grad_norm": 1.3125, + "learning_rate": 9.993387096774193e-05, + "loss": 0.2225, + "step": 542 + }, + { + "epoch": 0.008688, + "grad_norm": 1.203125, + "learning_rate": 9.993225806451613e-05, + "loss": 0.2459, + "step": 543 + }, + { + "epoch": 0.008704, + "grad_norm": 1.265625, + "learning_rate": 9.993064516129033e-05, + "loss": 0.22, + "step": 544 + }, + { + "epoch": 0.00872, + "grad_norm": 1.2109375, + "learning_rate": 9.992903225806453e-05, + "loss": 0.2087, + "step": 545 + }, + { + "epoch": 0.008736, + "grad_norm": 1.0234375, + "learning_rate": 9.992741935483872e-05, + "loss": 0.1843, + "step": 546 + }, + { + "epoch": 0.008752, + "grad_norm": 1.1796875, + "learning_rate": 9.992580645161292e-05, + "loss": 0.226, + "step": 547 + }, + { + "epoch": 0.008768, + "grad_norm": 1.40625, + "learning_rate": 9.99241935483871e-05, + "loss": 0.2528, + "step": 548 + }, + { + "epoch": 0.008784, + "grad_norm": 1.1484375, + "learning_rate": 9.992258064516129e-05, + "loss": 0.2189, + "step": 549 + }, + { + "epoch": 0.0088, + "grad_norm": 1.328125, + "learning_rate": 9.992096774193549e-05, + "loss": 0.2097, + "step": 550 + }, + { + "epoch": 0.008816, + "grad_norm": 1.1796875, + "learning_rate": 9.991935483870967e-05, + "loss": 0.2091, + "step": 551 + }, + { + "epoch": 0.008832, + "grad_norm": 2.03125, + "learning_rate": 9.991774193548387e-05, + "loss": 0.2194, + "step": 552 + }, + { + "epoch": 0.008848, + "grad_norm": 0.9609375, + "learning_rate": 9.991612903225806e-05, + "loss": 0.2161, + "step": 553 + }, + { + "epoch": 0.008864, + "grad_norm": 1.2109375, + "learning_rate": 9.991451612903226e-05, + "loss": 0.2358, + "step": 554 + }, + { + "epoch": 0.00888, + "grad_norm": 1.8984375, + "learning_rate": 9.991290322580646e-05, + "loss": 0.2606, + "step": 555 + }, + { + "epoch": 0.008896, + "grad_norm": 1.90625, + "learning_rate": 9.991129032258066e-05, + "loss": 0.3322, + "step": 556 + }, + { + "epoch": 0.008912, + "grad_norm": 1.421875, + "learning_rate": 9.990967741935485e-05, + "loss": 0.2521, + "step": 557 + }, + { + "epoch": 0.008928, + "grad_norm": 2.09375, + "learning_rate": 9.990806451612904e-05, + "loss": 0.1948, + "step": 558 + }, + { + "epoch": 0.008944, + "grad_norm": 2.046875, + "learning_rate": 9.990645161290323e-05, + "loss": 0.2403, + "step": 559 + }, + { + "epoch": 0.00896, + "grad_norm": 1.2734375, + "learning_rate": 9.990483870967743e-05, + "loss": 0.2228, + "step": 560 + }, + { + "epoch": 0.008976, + "grad_norm": 1.625, + "learning_rate": 9.990322580645162e-05, + "loss": 0.2595, + "step": 561 + }, + { + "epoch": 0.008992, + "grad_norm": 6.9375, + "learning_rate": 9.990161290322582e-05, + "loss": 0.2991, + "step": 562 + }, + { + "epoch": 0.009008, + "grad_norm": 1.6171875, + "learning_rate": 9.99e-05, + "loss": 0.2297, + "step": 563 + }, + { + "epoch": 0.009024, + "grad_norm": 3.578125, + "learning_rate": 9.989838709677419e-05, + "loss": 0.2649, + "step": 564 + }, + { + "epoch": 0.00904, + "grad_norm": 3.046875, + "learning_rate": 9.989677419354839e-05, + "loss": 0.3551, + "step": 565 + }, + { + "epoch": 0.009056, + "grad_norm": 2.359375, + "learning_rate": 9.989516129032257e-05, + "loss": 0.2454, + "step": 566 + }, + { + "epoch": 0.009072, + "grad_norm": 1.0859375, + "learning_rate": 9.989354838709677e-05, + "loss": 0.1993, + "step": 567 + }, + { + "epoch": 0.009088, + "grad_norm": 1.0625, + "learning_rate": 9.989193548387097e-05, + "loss": 0.1984, + "step": 568 + }, + { + "epoch": 0.009104, + "grad_norm": 1.7421875, + "learning_rate": 9.989032258064517e-05, + "loss": 0.2268, + "step": 569 + }, + { + "epoch": 0.00912, + "grad_norm": 1.2109375, + "learning_rate": 9.988870967741936e-05, + "loss": 0.2191, + "step": 570 + }, + { + "epoch": 0.009136, + "grad_norm": 1.109375, + "learning_rate": 9.988709677419356e-05, + "loss": 0.2279, + "step": 571 + }, + { + "epoch": 0.009152, + "grad_norm": 2.5625, + "learning_rate": 9.988548387096774e-05, + "loss": 0.2642, + "step": 572 + }, + { + "epoch": 0.009168, + "grad_norm": 1.2890625, + "learning_rate": 9.988387096774194e-05, + "loss": 0.2063, + "step": 573 + }, + { + "epoch": 0.009184, + "grad_norm": 1.34375, + "learning_rate": 9.988225806451613e-05, + "loss": 0.2621, + "step": 574 + }, + { + "epoch": 0.0092, + "grad_norm": 1.578125, + "learning_rate": 9.988064516129033e-05, + "loss": 0.2461, + "step": 575 + }, + { + "epoch": 0.009216, + "grad_norm": 1.8203125, + "learning_rate": 9.987903225806452e-05, + "loss": 0.2417, + "step": 576 + }, + { + "epoch": 0.009232, + "grad_norm": 2.296875, + "learning_rate": 9.987741935483872e-05, + "loss": 0.3172, + "step": 577 + }, + { + "epoch": 0.009248, + "grad_norm": 1.625, + "learning_rate": 9.98758064516129e-05, + "loss": 0.2045, + "step": 578 + }, + { + "epoch": 0.009264, + "grad_norm": 1.078125, + "learning_rate": 9.98741935483871e-05, + "loss": 0.2102, + "step": 579 + }, + { + "epoch": 0.00928, + "grad_norm": 1.6953125, + "learning_rate": 9.98725806451613e-05, + "loss": 0.2546, + "step": 580 + }, + { + "epoch": 0.009296, + "grad_norm": 1.2421875, + "learning_rate": 9.987096774193549e-05, + "loss": 0.2855, + "step": 581 + }, + { + "epoch": 0.009312, + "grad_norm": 0.91796875, + "learning_rate": 9.986935483870969e-05, + "loss": 0.1888, + "step": 582 + }, + { + "epoch": 0.009328, + "grad_norm": 1.59375, + "learning_rate": 9.986774193548387e-05, + "loss": 0.245, + "step": 583 + }, + { + "epoch": 0.009344, + "grad_norm": 1.0, + "learning_rate": 9.986612903225807e-05, + "loss": 0.1977, + "step": 584 + }, + { + "epoch": 0.00936, + "grad_norm": 1.2890625, + "learning_rate": 9.986451612903226e-05, + "loss": 0.239, + "step": 585 + }, + { + "epoch": 0.009376, + "grad_norm": 1.1953125, + "learning_rate": 9.986290322580646e-05, + "loss": 0.2344, + "step": 586 + }, + { + "epoch": 0.009392, + "grad_norm": 1.4765625, + "learning_rate": 9.986129032258064e-05, + "loss": 0.2706, + "step": 587 + }, + { + "epoch": 0.009408, + "grad_norm": 1.1953125, + "learning_rate": 9.985967741935484e-05, + "loss": 0.1928, + "step": 588 + }, + { + "epoch": 0.009424, + "grad_norm": 1.375, + "learning_rate": 9.985806451612903e-05, + "loss": 0.2447, + "step": 589 + }, + { + "epoch": 0.00944, + "grad_norm": 0.92578125, + "learning_rate": 9.985645161290323e-05, + "loss": 0.2096, + "step": 590 + }, + { + "epoch": 0.009456, + "grad_norm": 1.0390625, + "learning_rate": 9.985483870967743e-05, + "loss": 0.1623, + "step": 591 + }, + { + "epoch": 0.009472, + "grad_norm": 1.296875, + "learning_rate": 9.985322580645163e-05, + "loss": 0.2591, + "step": 592 + }, + { + "epoch": 0.009488, + "grad_norm": 1.21875, + "learning_rate": 9.985161290322582e-05, + "loss": 0.2334, + "step": 593 + }, + { + "epoch": 0.009504, + "grad_norm": 1.6015625, + "learning_rate": 9.985000000000001e-05, + "loss": 0.2411, + "step": 594 + }, + { + "epoch": 0.00952, + "grad_norm": 1.125, + "learning_rate": 9.98483870967742e-05, + "loss": 0.1777, + "step": 595 + }, + { + "epoch": 0.009536, + "grad_norm": 1.4140625, + "learning_rate": 9.984677419354839e-05, + "loss": 0.1899, + "step": 596 + }, + { + "epoch": 0.009552, + "grad_norm": 1.15625, + "learning_rate": 9.984516129032259e-05, + "loss": 0.2171, + "step": 597 + }, + { + "epoch": 0.009568, + "grad_norm": 2.296875, + "learning_rate": 9.984354838709677e-05, + "loss": 0.2364, + "step": 598 + }, + { + "epoch": 0.009584, + "grad_norm": 1.5390625, + "learning_rate": 9.984193548387097e-05, + "loss": 0.2572, + "step": 599 + }, + { + "epoch": 0.0096, + "grad_norm": 1.2109375, + "learning_rate": 9.984032258064516e-05, + "loss": 0.2171, + "step": 600 + }, + { + "epoch": 0.009616, + "grad_norm": 1.484375, + "learning_rate": 9.983870967741936e-05, + "loss": 0.1931, + "step": 601 + }, + { + "epoch": 0.009632, + "grad_norm": 1.2890625, + "learning_rate": 9.983709677419354e-05, + "loss": 0.1991, + "step": 602 + }, + { + "epoch": 0.009648, + "grad_norm": 1.3515625, + "learning_rate": 9.983548387096774e-05, + "loss": 0.2195, + "step": 603 + }, + { + "epoch": 0.009664, + "grad_norm": 1.8359375, + "learning_rate": 9.983387096774194e-05, + "loss": 0.1975, + "step": 604 + }, + { + "epoch": 0.00968, + "grad_norm": 1.625, + "learning_rate": 9.983225806451614e-05, + "loss": 0.2477, + "step": 605 + }, + { + "epoch": 0.009696, + "grad_norm": 1.9609375, + "learning_rate": 9.983064516129033e-05, + "loss": 0.239, + "step": 606 + }, + { + "epoch": 0.009712, + "grad_norm": 1.8671875, + "learning_rate": 9.982903225806453e-05, + "loss": 0.2338, + "step": 607 + }, + { + "epoch": 0.009728, + "grad_norm": 1.3359375, + "learning_rate": 9.982741935483871e-05, + "loss": 0.249, + "step": 608 + }, + { + "epoch": 0.009744, + "grad_norm": 1.203125, + "learning_rate": 9.982580645161291e-05, + "loss": 0.213, + "step": 609 + }, + { + "epoch": 0.00976, + "grad_norm": 1.171875, + "learning_rate": 9.98241935483871e-05, + "loss": 0.2249, + "step": 610 + }, + { + "epoch": 0.009776, + "grad_norm": 1.359375, + "learning_rate": 9.982258064516129e-05, + "loss": 0.2171, + "step": 611 + }, + { + "epoch": 0.009792, + "grad_norm": 1.40625, + "learning_rate": 9.982096774193549e-05, + "loss": 0.2302, + "step": 612 + }, + { + "epoch": 0.009808, + "grad_norm": 1.1484375, + "learning_rate": 9.981935483870967e-05, + "loss": 0.211, + "step": 613 + }, + { + "epoch": 0.009824, + "grad_norm": 1.3359375, + "learning_rate": 9.981774193548387e-05, + "loss": 0.2323, + "step": 614 + }, + { + "epoch": 0.00984, + "grad_norm": 1.1484375, + "learning_rate": 9.981612903225807e-05, + "loss": 0.2652, + "step": 615 + }, + { + "epoch": 0.009856, + "grad_norm": 1.3984375, + "learning_rate": 9.981451612903227e-05, + "loss": 0.2248, + "step": 616 + }, + { + "epoch": 0.009872, + "grad_norm": 1.1328125, + "learning_rate": 9.981290322580646e-05, + "loss": 0.2206, + "step": 617 + }, + { + "epoch": 0.009888, + "grad_norm": 0.9765625, + "learning_rate": 9.981129032258066e-05, + "loss": 0.2036, + "step": 618 + }, + { + "epoch": 0.009904, + "grad_norm": 1.5859375, + "learning_rate": 9.980967741935484e-05, + "loss": 0.2556, + "step": 619 + }, + { + "epoch": 0.00992, + "grad_norm": 2.15625, + "learning_rate": 9.980806451612904e-05, + "loss": 0.23, + "step": 620 + }, + { + "epoch": 0.009936, + "grad_norm": 0.93359375, + "learning_rate": 9.980645161290323e-05, + "loss": 0.1749, + "step": 621 + }, + { + "epoch": 0.009952, + "grad_norm": 0.91015625, + "learning_rate": 9.980483870967743e-05, + "loss": 0.2035, + "step": 622 + }, + { + "epoch": 0.009968, + "grad_norm": 1.078125, + "learning_rate": 9.980322580645161e-05, + "loss": 0.2004, + "step": 623 + }, + { + "epoch": 0.009984, + "grad_norm": 1.234375, + "learning_rate": 9.980161290322581e-05, + "loss": 0.2038, + "step": 624 + }, + { + "epoch": 0.01, + "grad_norm": 1.109375, + "learning_rate": 9.98e-05, + "loss": 0.2396, + "step": 625 + }, + { + "epoch": 0.010016, + "grad_norm": 1.1875, + "learning_rate": 9.97983870967742e-05, + "loss": 0.246, + "step": 626 + }, + { + "epoch": 0.010032, + "grad_norm": 1.0234375, + "learning_rate": 9.97967741935484e-05, + "loss": 0.2192, + "step": 627 + }, + { + "epoch": 0.010048, + "grad_norm": 1.0, + "learning_rate": 9.979516129032259e-05, + "loss": 0.1944, + "step": 628 + }, + { + "epoch": 0.010064, + "grad_norm": 1.65625, + "learning_rate": 9.979354838709678e-05, + "loss": 0.239, + "step": 629 + }, + { + "epoch": 0.01008, + "grad_norm": 1.2578125, + "learning_rate": 9.979193548387097e-05, + "loss": 0.2011, + "step": 630 + }, + { + "epoch": 0.010096, + "grad_norm": 1.0703125, + "learning_rate": 9.979032258064517e-05, + "loss": 0.2139, + "step": 631 + }, + { + "epoch": 0.010112, + "grad_norm": 1.421875, + "learning_rate": 9.978870967741936e-05, + "loss": 0.2209, + "step": 632 + }, + { + "epoch": 0.010128, + "grad_norm": 1.2734375, + "learning_rate": 9.978709677419356e-05, + "loss": 0.1927, + "step": 633 + }, + { + "epoch": 0.010144, + "grad_norm": 1.25, + "learning_rate": 9.978548387096774e-05, + "loss": 0.2266, + "step": 634 + }, + { + "epoch": 0.01016, + "grad_norm": 1.75, + "learning_rate": 9.978387096774194e-05, + "loss": 0.2366, + "step": 635 + }, + { + "epoch": 0.010176, + "grad_norm": 1.96875, + "learning_rate": 9.978225806451613e-05, + "loss": 0.2254, + "step": 636 + }, + { + "epoch": 0.010192, + "grad_norm": 1.3984375, + "learning_rate": 9.978064516129033e-05, + "loss": 0.181, + "step": 637 + }, + { + "epoch": 0.010208, + "grad_norm": 1.109375, + "learning_rate": 9.977903225806451e-05, + "loss": 0.2215, + "step": 638 + }, + { + "epoch": 0.010224, + "grad_norm": 1.4765625, + "learning_rate": 9.977741935483871e-05, + "loss": 0.2034, + "step": 639 + }, + { + "epoch": 0.01024, + "grad_norm": 1.8515625, + "learning_rate": 9.977580645161291e-05, + "loss": 0.218, + "step": 640 + }, + { + "epoch": 0.010256, + "grad_norm": 1.9921875, + "learning_rate": 9.977419354838711e-05, + "loss": 0.2423, + "step": 641 + }, + { + "epoch": 0.010272, + "grad_norm": 1.7265625, + "learning_rate": 9.97725806451613e-05, + "loss": 0.2492, + "step": 642 + }, + { + "epoch": 0.010288, + "grad_norm": 1.9765625, + "learning_rate": 9.977096774193548e-05, + "loss": 0.2564, + "step": 643 + }, + { + "epoch": 0.010304, + "grad_norm": 1.7578125, + "learning_rate": 9.976935483870968e-05, + "loss": 0.2402, + "step": 644 + }, + { + "epoch": 0.01032, + "grad_norm": 1.09375, + "learning_rate": 9.976774193548387e-05, + "loss": 0.2073, + "step": 645 + }, + { + "epoch": 0.010336, + "grad_norm": 1.9609375, + "learning_rate": 9.976612903225807e-05, + "loss": 0.2732, + "step": 646 + }, + { + "epoch": 0.010352, + "grad_norm": 1.578125, + "learning_rate": 9.976451612903226e-05, + "loss": 0.2381, + "step": 647 + }, + { + "epoch": 0.010368, + "grad_norm": 1.2890625, + "learning_rate": 9.976290322580646e-05, + "loss": 0.2117, + "step": 648 + }, + { + "epoch": 0.010384, + "grad_norm": 2.609375, + "learning_rate": 9.976129032258064e-05, + "loss": 0.1856, + "step": 649 + }, + { + "epoch": 0.0104, + "grad_norm": 2.0625, + "learning_rate": 9.975967741935484e-05, + "loss": 0.1958, + "step": 650 + }, + { + "epoch": 0.010416, + "grad_norm": 1.5390625, + "learning_rate": 9.975806451612904e-05, + "loss": 0.2336, + "step": 651 + }, + { + "epoch": 0.010432, + "grad_norm": 1.046875, + "learning_rate": 9.975645161290324e-05, + "loss": 0.1968, + "step": 652 + }, + { + "epoch": 0.010448, + "grad_norm": 1.265625, + "learning_rate": 9.975483870967743e-05, + "loss": 0.2473, + "step": 653 + }, + { + "epoch": 0.010464, + "grad_norm": 1.296875, + "learning_rate": 9.975322580645163e-05, + "loss": 0.2321, + "step": 654 + }, + { + "epoch": 0.01048, + "grad_norm": 1.796875, + "learning_rate": 9.975161290322581e-05, + "loss": 0.2386, + "step": 655 + }, + { + "epoch": 0.010496, + "grad_norm": 1.140625, + "learning_rate": 9.975000000000001e-05, + "loss": 0.2141, + "step": 656 + }, + { + "epoch": 0.010512, + "grad_norm": 2.0, + "learning_rate": 9.97483870967742e-05, + "loss": 0.1694, + "step": 657 + }, + { + "epoch": 0.010528, + "grad_norm": 1.3671875, + "learning_rate": 9.974677419354838e-05, + "loss": 0.2006, + "step": 658 + }, + { + "epoch": 0.010544, + "grad_norm": 1.0703125, + "learning_rate": 9.974516129032258e-05, + "loss": 0.163, + "step": 659 + }, + { + "epoch": 0.01056, + "grad_norm": 1.75, + "learning_rate": 9.974354838709677e-05, + "loss": 0.2819, + "step": 660 + }, + { + "epoch": 0.010576, + "grad_norm": 1.015625, + "learning_rate": 9.974193548387097e-05, + "loss": 0.204, + "step": 661 + }, + { + "epoch": 0.010592, + "grad_norm": 1.8515625, + "learning_rate": 9.974032258064516e-05, + "loss": 0.2165, + "step": 662 + }, + { + "epoch": 0.010608, + "grad_norm": 2.4375, + "learning_rate": 9.973870967741936e-05, + "loss": 0.2271, + "step": 663 + }, + { + "epoch": 0.010624, + "grad_norm": 1.78125, + "learning_rate": 9.973709677419356e-05, + "loss": 0.2063, + "step": 664 + }, + { + "epoch": 0.01064, + "grad_norm": 2.34375, + "learning_rate": 9.973548387096775e-05, + "loss": 0.2049, + "step": 665 + }, + { + "epoch": 0.010656, + "grad_norm": 1.0625, + "learning_rate": 9.973387096774194e-05, + "loss": 0.2072, + "step": 666 + }, + { + "epoch": 0.010672, + "grad_norm": 1.2578125, + "learning_rate": 9.973225806451614e-05, + "loss": 0.2217, + "step": 667 + }, + { + "epoch": 0.010688, + "grad_norm": 1.6328125, + "learning_rate": 9.973064516129033e-05, + "loss": 0.2347, + "step": 668 + }, + { + "epoch": 0.010704, + "grad_norm": 1.9296875, + "learning_rate": 9.972903225806453e-05, + "loss": 0.2551, + "step": 669 + }, + { + "epoch": 0.01072, + "grad_norm": 1.21875, + "learning_rate": 9.972741935483871e-05, + "loss": 0.197, + "step": 670 + }, + { + "epoch": 0.010736, + "grad_norm": 1.6328125, + "learning_rate": 9.972580645161291e-05, + "loss": 0.2067, + "step": 671 + }, + { + "epoch": 0.010752, + "grad_norm": 1.015625, + "learning_rate": 9.97241935483871e-05, + "loss": 0.1859, + "step": 672 + }, + { + "epoch": 0.010768, + "grad_norm": 2.390625, + "learning_rate": 9.972258064516128e-05, + "loss": 0.2165, + "step": 673 + }, + { + "epoch": 0.010784, + "grad_norm": 1.234375, + "learning_rate": 9.972096774193548e-05, + "loss": 0.2142, + "step": 674 + }, + { + "epoch": 0.0108, + "grad_norm": 1.1875, + "learning_rate": 9.971935483870968e-05, + "loss": 0.2233, + "step": 675 + }, + { + "epoch": 0.010816, + "grad_norm": 1.0546875, + "learning_rate": 9.971774193548388e-05, + "loss": 0.1952, + "step": 676 + }, + { + "epoch": 0.010832, + "grad_norm": 1.7890625, + "learning_rate": 9.971612903225807e-05, + "loss": 0.3098, + "step": 677 + }, + { + "epoch": 0.010848, + "grad_norm": 1.1328125, + "learning_rate": 9.971451612903227e-05, + "loss": 0.2258, + "step": 678 + }, + { + "epoch": 0.010864, + "grad_norm": 1.265625, + "learning_rate": 9.971290322580645e-05, + "loss": 0.2495, + "step": 679 + }, + { + "epoch": 0.01088, + "grad_norm": 1.1328125, + "learning_rate": 9.971129032258065e-05, + "loss": 0.2017, + "step": 680 + }, + { + "epoch": 0.010896, + "grad_norm": 1.328125, + "learning_rate": 9.970967741935484e-05, + "loss": 0.2053, + "step": 681 + }, + { + "epoch": 0.010912, + "grad_norm": 1.328125, + "learning_rate": 9.970806451612904e-05, + "loss": 0.1789, + "step": 682 + }, + { + "epoch": 0.010928, + "grad_norm": 1.1015625, + "learning_rate": 9.970645161290323e-05, + "loss": 0.2078, + "step": 683 + }, + { + "epoch": 0.010944, + "grad_norm": 1.2109375, + "learning_rate": 9.970483870967743e-05, + "loss": 0.2125, + "step": 684 + }, + { + "epoch": 0.01096, + "grad_norm": 0.9296875, + "learning_rate": 9.970322580645161e-05, + "loss": 0.2103, + "step": 685 + }, + { + "epoch": 0.010976, + "grad_norm": 1.3359375, + "learning_rate": 9.970161290322581e-05, + "loss": 0.244, + "step": 686 + }, + { + "epoch": 0.010992, + "grad_norm": 1.4296875, + "learning_rate": 9.970000000000001e-05, + "loss": 0.225, + "step": 687 + }, + { + "epoch": 0.011008, + "grad_norm": 1.3515625, + "learning_rate": 9.96983870967742e-05, + "loss": 0.2329, + "step": 688 + }, + { + "epoch": 0.011024, + "grad_norm": 1.65625, + "learning_rate": 9.96967741935484e-05, + "loss": 0.2012, + "step": 689 + }, + { + "epoch": 0.01104, + "grad_norm": 1.390625, + "learning_rate": 9.969516129032258e-05, + "loss": 0.2037, + "step": 690 + }, + { + "epoch": 0.011056, + "grad_norm": 1.8125, + "learning_rate": 9.969354838709678e-05, + "loss": 0.2204, + "step": 691 + }, + { + "epoch": 0.011072, + "grad_norm": 1.3046875, + "learning_rate": 9.969193548387097e-05, + "loss": 0.2104, + "step": 692 + }, + { + "epoch": 0.011088, + "grad_norm": 1.734375, + "learning_rate": 9.969032258064517e-05, + "loss": 0.1956, + "step": 693 + }, + { + "epoch": 0.011104, + "grad_norm": 1.2265625, + "learning_rate": 9.968870967741935e-05, + "loss": 0.2028, + "step": 694 + }, + { + "epoch": 0.01112, + "grad_norm": 1.21875, + "learning_rate": 9.968709677419355e-05, + "loss": 0.2225, + "step": 695 + }, + { + "epoch": 0.011136, + "grad_norm": 1.28125, + "learning_rate": 9.968548387096774e-05, + "loss": 0.2074, + "step": 696 + }, + { + "epoch": 0.011152, + "grad_norm": 1.7734375, + "learning_rate": 9.968387096774194e-05, + "loss": 0.2424, + "step": 697 + }, + { + "epoch": 0.011168, + "grad_norm": 1.6328125, + "learning_rate": 9.968225806451613e-05, + "loss": 0.2151, + "step": 698 + }, + { + "epoch": 0.011184, + "grad_norm": 1.171875, + "learning_rate": 9.968064516129033e-05, + "loss": 0.2348, + "step": 699 + }, + { + "epoch": 0.0112, + "grad_norm": 1.7265625, + "learning_rate": 9.967903225806452e-05, + "loss": 0.2842, + "step": 700 + }, + { + "epoch": 0.011216, + "grad_norm": 1.1171875, + "learning_rate": 9.967741935483872e-05, + "loss": 0.2215, + "step": 701 + }, + { + "epoch": 0.011232, + "grad_norm": 1.28125, + "learning_rate": 9.967580645161291e-05, + "loss": 0.2159, + "step": 702 + }, + { + "epoch": 0.011248, + "grad_norm": 1.546875, + "learning_rate": 9.967419354838711e-05, + "loss": 0.2333, + "step": 703 + }, + { + "epoch": 0.011264, + "grad_norm": 2.484375, + "learning_rate": 9.96725806451613e-05, + "loss": 0.2268, + "step": 704 + }, + { + "epoch": 0.01128, + "grad_norm": 2.703125, + "learning_rate": 9.967096774193548e-05, + "loss": 0.2241, + "step": 705 + }, + { + "epoch": 0.011296, + "grad_norm": 1.640625, + "learning_rate": 9.966935483870968e-05, + "loss": 0.2038, + "step": 706 + }, + { + "epoch": 0.011312, + "grad_norm": 1.6328125, + "learning_rate": 9.966774193548387e-05, + "loss": 0.2241, + "step": 707 + }, + { + "epoch": 0.011328, + "grad_norm": 1.2890625, + "learning_rate": 9.966612903225807e-05, + "loss": 0.197, + "step": 708 + }, + { + "epoch": 0.011344, + "grad_norm": 1.5078125, + "learning_rate": 9.966451612903225e-05, + "loss": 0.2804, + "step": 709 + }, + { + "epoch": 0.01136, + "grad_norm": 1.546875, + "learning_rate": 9.966290322580645e-05, + "loss": 0.1791, + "step": 710 + }, + { + "epoch": 0.011376, + "grad_norm": 1.1015625, + "learning_rate": 9.966129032258065e-05, + "loss": 0.1964, + "step": 711 + }, + { + "epoch": 0.011392, + "grad_norm": 1.8203125, + "learning_rate": 9.965967741935485e-05, + "loss": 0.1916, + "step": 712 + }, + { + "epoch": 0.011408, + "grad_norm": 1.5546875, + "learning_rate": 9.965806451612904e-05, + "loss": 0.1687, + "step": 713 + }, + { + "epoch": 0.011424, + "grad_norm": 1.3515625, + "learning_rate": 9.965645161290324e-05, + "loss": 0.1872, + "step": 714 + }, + { + "epoch": 0.01144, + "grad_norm": 1.1484375, + "learning_rate": 9.965483870967742e-05, + "loss": 0.152, + "step": 715 + }, + { + "epoch": 0.011456, + "grad_norm": 1.34375, + "learning_rate": 9.965322580645162e-05, + "loss": 0.2456, + "step": 716 + }, + { + "epoch": 0.011472, + "grad_norm": 1.140625, + "learning_rate": 9.965161290322581e-05, + "loss": 0.2287, + "step": 717 + }, + { + "epoch": 0.011488, + "grad_norm": 1.71875, + "learning_rate": 9.965000000000001e-05, + "loss": 0.2455, + "step": 718 + }, + { + "epoch": 0.011504, + "grad_norm": 1.2578125, + "learning_rate": 9.96483870967742e-05, + "loss": 0.1931, + "step": 719 + }, + { + "epoch": 0.01152, + "grad_norm": 1.46875, + "learning_rate": 9.964677419354838e-05, + "loss": 0.2275, + "step": 720 + }, + { + "epoch": 0.011536, + "grad_norm": 1.3984375, + "learning_rate": 9.964516129032258e-05, + "loss": 0.2642, + "step": 721 + }, + { + "epoch": 0.011552, + "grad_norm": 1.0546875, + "learning_rate": 9.964354838709678e-05, + "loss": 0.2104, + "step": 722 + }, + { + "epoch": 0.011568, + "grad_norm": 1.1640625, + "learning_rate": 9.964193548387097e-05, + "loss": 0.2519, + "step": 723 + }, + { + "epoch": 0.011584, + "grad_norm": 1.1484375, + "learning_rate": 9.964032258064517e-05, + "loss": 0.2009, + "step": 724 + }, + { + "epoch": 0.0116, + "grad_norm": 1.28125, + "learning_rate": 9.963870967741937e-05, + "loss": 0.2427, + "step": 725 + }, + { + "epoch": 0.011616, + "grad_norm": 1.578125, + "learning_rate": 9.963709677419355e-05, + "loss": 0.2292, + "step": 726 + }, + { + "epoch": 0.011632, + "grad_norm": 1.46875, + "learning_rate": 9.963548387096775e-05, + "loss": 0.2484, + "step": 727 + }, + { + "epoch": 0.011648, + "grad_norm": 1.40625, + "learning_rate": 9.963387096774194e-05, + "loss": 0.217, + "step": 728 + }, + { + "epoch": 0.011664, + "grad_norm": 1.7578125, + "learning_rate": 9.963225806451614e-05, + "loss": 0.2343, + "step": 729 + }, + { + "epoch": 0.01168, + "grad_norm": 1.734375, + "learning_rate": 9.963064516129032e-05, + "loss": 0.2305, + "step": 730 + }, + { + "epoch": 0.011696, + "grad_norm": 1.3828125, + "learning_rate": 9.962903225806452e-05, + "loss": 0.2008, + "step": 731 + }, + { + "epoch": 0.011712, + "grad_norm": 0.99609375, + "learning_rate": 9.962741935483871e-05, + "loss": 0.237, + "step": 732 + }, + { + "epoch": 0.011728, + "grad_norm": 1.203125, + "learning_rate": 9.962580645161291e-05, + "loss": 0.1682, + "step": 733 + }, + { + "epoch": 0.011744, + "grad_norm": 1.203125, + "learning_rate": 9.96241935483871e-05, + "loss": 0.2004, + "step": 734 + }, + { + "epoch": 0.01176, + "grad_norm": 1.296875, + "learning_rate": 9.96225806451613e-05, + "loss": 0.2307, + "step": 735 + }, + { + "epoch": 0.011776, + "grad_norm": 0.890625, + "learning_rate": 9.96209677419355e-05, + "loss": 0.2032, + "step": 736 + }, + { + "epoch": 0.011792, + "grad_norm": 2.9375, + "learning_rate": 9.961935483870968e-05, + "loss": 0.2416, + "step": 737 + }, + { + "epoch": 0.011808, + "grad_norm": 1.8359375, + "learning_rate": 9.961774193548388e-05, + "loss": 0.1944, + "step": 738 + }, + { + "epoch": 0.011824, + "grad_norm": 2.40625, + "learning_rate": 9.961612903225807e-05, + "loss": 0.1941, + "step": 739 + }, + { + "epoch": 0.01184, + "grad_norm": 1.5078125, + "learning_rate": 9.961451612903227e-05, + "loss": 0.2097, + "step": 740 + }, + { + "epoch": 0.011856, + "grad_norm": 1.203125, + "learning_rate": 9.961290322580645e-05, + "loss": 0.2272, + "step": 741 + }, + { + "epoch": 0.011872, + "grad_norm": 1.3515625, + "learning_rate": 9.961129032258065e-05, + "loss": 0.1824, + "step": 742 + }, + { + "epoch": 0.011888, + "grad_norm": 1.0546875, + "learning_rate": 9.960967741935484e-05, + "loss": 0.2028, + "step": 743 + }, + { + "epoch": 0.011904, + "grad_norm": 0.9296875, + "learning_rate": 9.960806451612904e-05, + "loss": 0.1955, + "step": 744 + }, + { + "epoch": 0.01192, + "grad_norm": 3.109375, + "learning_rate": 9.960645161290322e-05, + "loss": 0.2003, + "step": 745 + }, + { + "epoch": 0.011936, + "grad_norm": 1.1953125, + "learning_rate": 9.960483870967742e-05, + "loss": 0.2222, + "step": 746 + }, + { + "epoch": 0.011952, + "grad_norm": 5.375, + "learning_rate": 9.960322580645162e-05, + "loss": 0.2473, + "step": 747 + }, + { + "epoch": 0.011968, + "grad_norm": 1.296875, + "learning_rate": 9.960161290322582e-05, + "loss": 0.1969, + "step": 748 + }, + { + "epoch": 0.011984, + "grad_norm": 1.03125, + "learning_rate": 9.960000000000001e-05, + "loss": 0.2114, + "step": 749 + }, + { + "epoch": 0.012, + "grad_norm": 1.2578125, + "learning_rate": 9.95983870967742e-05, + "loss": 0.2058, + "step": 750 + }, + { + "epoch": 0.012016, + "grad_norm": 1.2265625, + "learning_rate": 9.95967741935484e-05, + "loss": 0.2034, + "step": 751 + }, + { + "epoch": 0.012032, + "grad_norm": 0.84375, + "learning_rate": 9.959516129032258e-05, + "loss": 0.1769, + "step": 752 + }, + { + "epoch": 0.012048, + "grad_norm": 1.1015625, + "learning_rate": 9.959354838709678e-05, + "loss": 0.2398, + "step": 753 + }, + { + "epoch": 0.012064, + "grad_norm": 1.6953125, + "learning_rate": 9.959193548387097e-05, + "loss": 0.2288, + "step": 754 + }, + { + "epoch": 0.01208, + "grad_norm": 0.9765625, + "learning_rate": 9.959032258064517e-05, + "loss": 0.1635, + "step": 755 + }, + { + "epoch": 0.012096, + "grad_norm": 1.21875, + "learning_rate": 9.958870967741935e-05, + "loss": 0.1848, + "step": 756 + }, + { + "epoch": 0.012112, + "grad_norm": 0.921875, + "learning_rate": 9.958709677419355e-05, + "loss": 0.1702, + "step": 757 + }, + { + "epoch": 0.012128, + "grad_norm": 1.5859375, + "learning_rate": 9.958548387096774e-05, + "loss": 0.2262, + "step": 758 + }, + { + "epoch": 0.012144, + "grad_norm": 1.28125, + "learning_rate": 9.958387096774194e-05, + "loss": 0.1678, + "step": 759 + }, + { + "epoch": 0.01216, + "grad_norm": 0.9921875, + "learning_rate": 9.958225806451614e-05, + "loss": 0.1731, + "step": 760 + }, + { + "epoch": 0.012176, + "grad_norm": 2.90625, + "learning_rate": 9.958064516129034e-05, + "loss": 0.2346, + "step": 761 + }, + { + "epoch": 0.012192, + "grad_norm": 1.0703125, + "learning_rate": 9.957903225806452e-05, + "loss": 0.2022, + "step": 762 + }, + { + "epoch": 0.012208, + "grad_norm": 1.0234375, + "learning_rate": 9.957741935483872e-05, + "loss": 0.2034, + "step": 763 + }, + { + "epoch": 0.012224, + "grad_norm": 1.21875, + "learning_rate": 9.957580645161291e-05, + "loss": 0.1924, + "step": 764 + }, + { + "epoch": 0.01224, + "grad_norm": 1.0390625, + "learning_rate": 9.957419354838711e-05, + "loss": 0.1805, + "step": 765 + }, + { + "epoch": 0.012256, + "grad_norm": 1.5546875, + "learning_rate": 9.95725806451613e-05, + "loss": 0.2055, + "step": 766 + }, + { + "epoch": 0.012272, + "grad_norm": 1.3359375, + "learning_rate": 9.957096774193548e-05, + "loss": 0.2685, + "step": 767 + }, + { + "epoch": 0.012288, + "grad_norm": 1.1640625, + "learning_rate": 9.956935483870968e-05, + "loss": 0.2385, + "step": 768 + }, + { + "epoch": 0.012304, + "grad_norm": 0.91015625, + "learning_rate": 9.956774193548387e-05, + "loss": 0.1792, + "step": 769 + }, + { + "epoch": 0.01232, + "grad_norm": 1.0859375, + "learning_rate": 9.956612903225807e-05, + "loss": 0.2171, + "step": 770 + }, + { + "epoch": 0.012336, + "grad_norm": 1.59375, + "learning_rate": 9.956451612903226e-05, + "loss": 0.2407, + "step": 771 + }, + { + "epoch": 0.012352, + "grad_norm": 1.9375, + "learning_rate": 9.956290322580646e-05, + "loss": 0.19, + "step": 772 + }, + { + "epoch": 0.012368, + "grad_norm": 1.078125, + "learning_rate": 9.956129032258065e-05, + "loss": 0.2377, + "step": 773 + }, + { + "epoch": 0.012384, + "grad_norm": 0.78125, + "learning_rate": 9.955967741935485e-05, + "loss": 0.1635, + "step": 774 + }, + { + "epoch": 0.0124, + "grad_norm": 1.1171875, + "learning_rate": 9.955806451612904e-05, + "loss": 0.2026, + "step": 775 + }, + { + "epoch": 0.012416, + "grad_norm": 0.8515625, + "learning_rate": 9.955645161290324e-05, + "loss": 0.2005, + "step": 776 + }, + { + "epoch": 0.012432, + "grad_norm": 1.3671875, + "learning_rate": 9.955483870967742e-05, + "loss": 0.1962, + "step": 777 + }, + { + "epoch": 0.012448, + "grad_norm": 1.125, + "learning_rate": 9.955322580645162e-05, + "loss": 0.2015, + "step": 778 + }, + { + "epoch": 0.012464, + "grad_norm": 1.1328125, + "learning_rate": 9.955161290322581e-05, + "loss": 0.1914, + "step": 779 + }, + { + "epoch": 0.01248, + "grad_norm": 1.4296875, + "learning_rate": 9.955000000000001e-05, + "loss": 0.2031, + "step": 780 + }, + { + "epoch": 0.012496, + "grad_norm": 1.53125, + "learning_rate": 9.95483870967742e-05, + "loss": 0.2236, + "step": 781 + }, + { + "epoch": 0.012512, + "grad_norm": 0.91015625, + "learning_rate": 9.954677419354839e-05, + "loss": 0.1848, + "step": 782 + }, + { + "epoch": 0.012528, + "grad_norm": 0.9921875, + "learning_rate": 9.954516129032259e-05, + "loss": 0.1812, + "step": 783 + }, + { + "epoch": 0.012544, + "grad_norm": 1.1875, + "learning_rate": 9.954354838709678e-05, + "loss": 0.2319, + "step": 784 + }, + { + "epoch": 0.01256, + "grad_norm": 1.1171875, + "learning_rate": 9.954193548387098e-05, + "loss": 0.2012, + "step": 785 + }, + { + "epoch": 0.012576, + "grad_norm": 1.0546875, + "learning_rate": 9.954032258064516e-05, + "loss": 0.192, + "step": 786 + }, + { + "epoch": 0.012592, + "grad_norm": 1.34375, + "learning_rate": 9.953870967741936e-05, + "loss": 0.2039, + "step": 787 + }, + { + "epoch": 0.012608, + "grad_norm": 1.1875, + "learning_rate": 9.953709677419355e-05, + "loss": 0.1609, + "step": 788 + }, + { + "epoch": 0.012624, + "grad_norm": 1.0703125, + "learning_rate": 9.953548387096775e-05, + "loss": 0.1954, + "step": 789 + }, + { + "epoch": 0.01264, + "grad_norm": 1.484375, + "learning_rate": 9.953387096774194e-05, + "loss": 0.223, + "step": 790 + }, + { + "epoch": 0.012656, + "grad_norm": 1.3671875, + "learning_rate": 9.953225806451614e-05, + "loss": 0.2148, + "step": 791 + }, + { + "epoch": 0.012672, + "grad_norm": 1.75, + "learning_rate": 9.953064516129032e-05, + "loss": 0.2116, + "step": 792 + }, + { + "epoch": 0.012688, + "grad_norm": 1.53125, + "learning_rate": 9.952903225806452e-05, + "loss": 0.2004, + "step": 793 + }, + { + "epoch": 0.012704, + "grad_norm": 1.2265625, + "learning_rate": 9.952741935483871e-05, + "loss": 0.2442, + "step": 794 + }, + { + "epoch": 0.01272, + "grad_norm": 0.984375, + "learning_rate": 9.952580645161291e-05, + "loss": 0.2269, + "step": 795 + }, + { + "epoch": 0.012736, + "grad_norm": 1.203125, + "learning_rate": 9.95241935483871e-05, + "loss": 0.1999, + "step": 796 + }, + { + "epoch": 0.012752, + "grad_norm": 1.2109375, + "learning_rate": 9.952258064516129e-05, + "loss": 0.1885, + "step": 797 + }, + { + "epoch": 0.012768, + "grad_norm": 0.984375, + "learning_rate": 9.952096774193549e-05, + "loss": 0.2183, + "step": 798 + }, + { + "epoch": 0.012784, + "grad_norm": 1.0703125, + "learning_rate": 9.951935483870968e-05, + "loss": 0.2101, + "step": 799 + }, + { + "epoch": 0.0128, + "grad_norm": 1.296875, + "learning_rate": 9.951774193548388e-05, + "loss": 0.2546, + "step": 800 + }, + { + "epoch": 0.012816, + "grad_norm": 1.3046875, + "learning_rate": 9.951612903225806e-05, + "loss": 0.2341, + "step": 801 + }, + { + "epoch": 0.012832, + "grad_norm": 0.890625, + "learning_rate": 9.951451612903226e-05, + "loss": 0.2112, + "step": 802 + }, + { + "epoch": 0.012848, + "grad_norm": 1.1953125, + "learning_rate": 9.951290322580645e-05, + "loss": 0.2143, + "step": 803 + }, + { + "epoch": 0.012864, + "grad_norm": 1.4296875, + "learning_rate": 9.951129032258065e-05, + "loss": 0.2114, + "step": 804 + }, + { + "epoch": 0.01288, + "grad_norm": 1.140625, + "learning_rate": 9.950967741935484e-05, + "loss": 0.1925, + "step": 805 + }, + { + "epoch": 0.012896, + "grad_norm": 1.375, + "learning_rate": 9.950806451612904e-05, + "loss": 0.2346, + "step": 806 + }, + { + "epoch": 0.012912, + "grad_norm": 1.015625, + "learning_rate": 9.950645161290323e-05, + "loss": 0.2254, + "step": 807 + }, + { + "epoch": 0.012928, + "grad_norm": 1.3359375, + "learning_rate": 9.950483870967743e-05, + "loss": 0.2141, + "step": 808 + }, + { + "epoch": 0.012944, + "grad_norm": 1.3046875, + "learning_rate": 9.950322580645162e-05, + "loss": 0.2551, + "step": 809 + }, + { + "epoch": 0.01296, + "grad_norm": 1.2421875, + "learning_rate": 9.950161290322582e-05, + "loss": 0.1671, + "step": 810 + }, + { + "epoch": 0.012976, + "grad_norm": 2.03125, + "learning_rate": 9.95e-05, + "loss": 0.2356, + "step": 811 + }, + { + "epoch": 0.012992, + "grad_norm": 0.9609375, + "learning_rate": 9.94983870967742e-05, + "loss": 0.21, + "step": 812 + }, + { + "epoch": 0.013008, + "grad_norm": 0.93359375, + "learning_rate": 9.949677419354839e-05, + "loss": 0.1723, + "step": 813 + }, + { + "epoch": 0.013024, + "grad_norm": 1.03125, + "learning_rate": 9.949516129032258e-05, + "loss": 0.1829, + "step": 814 + }, + { + "epoch": 0.01304, + "grad_norm": 1.1171875, + "learning_rate": 9.949354838709678e-05, + "loss": 0.2432, + "step": 815 + }, + { + "epoch": 0.013056, + "grad_norm": 0.8984375, + "learning_rate": 9.949193548387096e-05, + "loss": 0.2011, + "step": 816 + }, + { + "epoch": 0.013072, + "grad_norm": 0.7421875, + "learning_rate": 9.949032258064516e-05, + "loss": 0.173, + "step": 817 + }, + { + "epoch": 0.013088, + "grad_norm": 0.890625, + "learning_rate": 9.948870967741935e-05, + "loss": 0.1763, + "step": 818 + }, + { + "epoch": 0.013104, + "grad_norm": 1.21875, + "learning_rate": 9.948709677419355e-05, + "loss": 0.2085, + "step": 819 + }, + { + "epoch": 0.01312, + "grad_norm": 1.234375, + "learning_rate": 9.948548387096775e-05, + "loss": 0.1955, + "step": 820 + }, + { + "epoch": 0.013136, + "grad_norm": 1.3359375, + "learning_rate": 9.948387096774195e-05, + "loss": 0.2483, + "step": 821 + }, + { + "epoch": 0.013152, + "grad_norm": 1.3359375, + "learning_rate": 9.948225806451613e-05, + "loss": 0.2284, + "step": 822 + }, + { + "epoch": 0.013168, + "grad_norm": 1.4765625, + "learning_rate": 9.948064516129033e-05, + "loss": 0.2112, + "step": 823 + }, + { + "epoch": 0.013184, + "grad_norm": 1.4296875, + "learning_rate": 9.947903225806452e-05, + "loss": 0.2141, + "step": 824 + }, + { + "epoch": 0.0132, + "grad_norm": 1.546875, + "learning_rate": 9.947741935483872e-05, + "loss": 0.2904, + "step": 825 + }, + { + "epoch": 0.013216, + "grad_norm": 1.53125, + "learning_rate": 9.94758064516129e-05, + "loss": 0.2136, + "step": 826 + }, + { + "epoch": 0.013232, + "grad_norm": 1.046875, + "learning_rate": 9.94741935483871e-05, + "loss": 0.2146, + "step": 827 + }, + { + "epoch": 0.013248, + "grad_norm": 0.91015625, + "learning_rate": 9.947258064516129e-05, + "loss": 0.2263, + "step": 828 + }, + { + "epoch": 0.013264, + "grad_norm": 1.078125, + "learning_rate": 9.947096774193548e-05, + "loss": 0.2406, + "step": 829 + }, + { + "epoch": 0.01328, + "grad_norm": 0.8984375, + "learning_rate": 9.946935483870968e-05, + "loss": 0.2023, + "step": 830 + }, + { + "epoch": 0.013296, + "grad_norm": 1.21875, + "learning_rate": 9.946774193548388e-05, + "loss": 0.2262, + "step": 831 + }, + { + "epoch": 0.013312, + "grad_norm": 0.94921875, + "learning_rate": 9.946612903225808e-05, + "loss": 0.1889, + "step": 832 + }, + { + "epoch": 0.013328, + "grad_norm": 1.0859375, + "learning_rate": 9.946451612903226e-05, + "loss": 0.2241, + "step": 833 + }, + { + "epoch": 0.013344, + "grad_norm": 1.40625, + "learning_rate": 9.946290322580646e-05, + "loss": 0.2066, + "step": 834 + }, + { + "epoch": 0.01336, + "grad_norm": 1.1875, + "learning_rate": 9.946129032258065e-05, + "loss": 0.2462, + "step": 835 + }, + { + "epoch": 0.013376, + "grad_norm": 1.2109375, + "learning_rate": 9.945967741935485e-05, + "loss": 0.1734, + "step": 836 + }, + { + "epoch": 0.013392, + "grad_norm": 1.2421875, + "learning_rate": 9.945806451612903e-05, + "loss": 0.2366, + "step": 837 + }, + { + "epoch": 0.013408, + "grad_norm": 1.40625, + "learning_rate": 9.945645161290323e-05, + "loss": 0.2175, + "step": 838 + }, + { + "epoch": 0.013424, + "grad_norm": 1.4921875, + "learning_rate": 9.945483870967742e-05, + "loss": 0.217, + "step": 839 + }, + { + "epoch": 0.01344, + "grad_norm": 1.21875, + "learning_rate": 9.945322580645162e-05, + "loss": 0.2056, + "step": 840 + }, + { + "epoch": 0.013456, + "grad_norm": 0.85546875, + "learning_rate": 9.94516129032258e-05, + "loss": 0.1785, + "step": 841 + }, + { + "epoch": 0.013472, + "grad_norm": 1.2734375, + "learning_rate": 9.945e-05, + "loss": 0.2163, + "step": 842 + }, + { + "epoch": 0.013488, + "grad_norm": 1.6328125, + "learning_rate": 9.94483870967742e-05, + "loss": 0.2325, + "step": 843 + }, + { + "epoch": 0.013504, + "grad_norm": 1.09375, + "learning_rate": 9.944677419354839e-05, + "loss": 0.2077, + "step": 844 + }, + { + "epoch": 0.01352, + "grad_norm": 1.3046875, + "learning_rate": 9.944516129032259e-05, + "loss": 0.2435, + "step": 845 + }, + { + "epoch": 0.013536, + "grad_norm": 1.0390625, + "learning_rate": 9.944354838709678e-05, + "loss": 0.1962, + "step": 846 + }, + { + "epoch": 0.013552, + "grad_norm": 1.1171875, + "learning_rate": 9.944193548387098e-05, + "loss": 0.2145, + "step": 847 + }, + { + "epoch": 0.013568, + "grad_norm": 1.46875, + "learning_rate": 9.944032258064516e-05, + "loss": 0.201, + "step": 848 + }, + { + "epoch": 0.013584, + "grad_norm": 1.28125, + "learning_rate": 9.943870967741936e-05, + "loss": 0.19, + "step": 849 + }, + { + "epoch": 0.0136, + "grad_norm": 1.3828125, + "learning_rate": 9.943709677419355e-05, + "loss": 0.2108, + "step": 850 + }, + { + "epoch": 0.013616, + "grad_norm": 0.890625, + "learning_rate": 9.943548387096775e-05, + "loss": 0.1802, + "step": 851 + }, + { + "epoch": 0.013632, + "grad_norm": 1.484375, + "learning_rate": 9.943387096774193e-05, + "loss": 0.2137, + "step": 852 + }, + { + "epoch": 0.013648, + "grad_norm": 1.3125, + "learning_rate": 9.943225806451613e-05, + "loss": 0.203, + "step": 853 + }, + { + "epoch": 0.013664, + "grad_norm": 0.98046875, + "learning_rate": 9.943064516129032e-05, + "loss": 0.1695, + "step": 854 + }, + { + "epoch": 0.01368, + "grad_norm": 1.2578125, + "learning_rate": 9.942903225806452e-05, + "loss": 0.2387, + "step": 855 + }, + { + "epoch": 0.013696, + "grad_norm": 1.21875, + "learning_rate": 9.942741935483872e-05, + "loss": 0.1774, + "step": 856 + }, + { + "epoch": 0.013712, + "grad_norm": 1.1640625, + "learning_rate": 9.942580645161292e-05, + "loss": 0.2013, + "step": 857 + }, + { + "epoch": 0.013728, + "grad_norm": 1.1171875, + "learning_rate": 9.94241935483871e-05, + "loss": 0.2341, + "step": 858 + }, + { + "epoch": 0.013744, + "grad_norm": 1.234375, + "learning_rate": 9.942258064516129e-05, + "loss": 0.1948, + "step": 859 + }, + { + "epoch": 0.01376, + "grad_norm": 1.2421875, + "learning_rate": 9.942096774193549e-05, + "loss": 0.1863, + "step": 860 + }, + { + "epoch": 0.013776, + "grad_norm": 2.078125, + "learning_rate": 9.941935483870968e-05, + "loss": 0.2684, + "step": 861 + }, + { + "epoch": 0.013792, + "grad_norm": 1.4453125, + "learning_rate": 9.941774193548388e-05, + "loss": 0.2227, + "step": 862 + }, + { + "epoch": 0.013808, + "grad_norm": 1.0703125, + "learning_rate": 9.941612903225806e-05, + "loss": 0.2022, + "step": 863 + }, + { + "epoch": 0.013824, + "grad_norm": 1.515625, + "learning_rate": 9.941451612903226e-05, + "loss": 0.2286, + "step": 864 + }, + { + "epoch": 0.01384, + "grad_norm": 1.1640625, + "learning_rate": 9.941290322580645e-05, + "loss": 0.2393, + "step": 865 + }, + { + "epoch": 0.013856, + "grad_norm": 1.1328125, + "learning_rate": 9.941129032258065e-05, + "loss": 0.2288, + "step": 866 + }, + { + "epoch": 0.013872, + "grad_norm": 0.9453125, + "learning_rate": 9.940967741935485e-05, + "loss": 0.1901, + "step": 867 + }, + { + "epoch": 0.013888, + "grad_norm": 0.96875, + "learning_rate": 9.940806451612905e-05, + "loss": 0.1744, + "step": 868 + }, + { + "epoch": 0.013904, + "grad_norm": 1.6171875, + "learning_rate": 9.940645161290323e-05, + "loss": 0.2246, + "step": 869 + }, + { + "epoch": 0.01392, + "grad_norm": 2.234375, + "learning_rate": 9.940483870967743e-05, + "loss": 0.2137, + "step": 870 + }, + { + "epoch": 0.013936, + "grad_norm": 1.359375, + "learning_rate": 9.940322580645162e-05, + "loss": 0.27, + "step": 871 + }, + { + "epoch": 0.013952, + "grad_norm": 1.171875, + "learning_rate": 9.940161290322582e-05, + "loss": 0.1714, + "step": 872 + }, + { + "epoch": 0.013968, + "grad_norm": 0.92578125, + "learning_rate": 9.94e-05, + "loss": 0.183, + "step": 873 + }, + { + "epoch": 0.013984, + "grad_norm": 1.3515625, + "learning_rate": 9.93983870967742e-05, + "loss": 0.2285, + "step": 874 + }, + { + "epoch": 0.014, + "grad_norm": 1.1953125, + "learning_rate": 9.939677419354839e-05, + "loss": 0.2273, + "step": 875 + }, + { + "epoch": 0.014016, + "grad_norm": 1.1171875, + "learning_rate": 9.939516129032258e-05, + "loss": 0.2104, + "step": 876 + }, + { + "epoch": 0.014032, + "grad_norm": 1.421875, + "learning_rate": 9.939354838709678e-05, + "loss": 0.2322, + "step": 877 + }, + { + "epoch": 0.014048, + "grad_norm": 1.40625, + "learning_rate": 9.939193548387097e-05, + "loss": 0.2317, + "step": 878 + }, + { + "epoch": 0.014064, + "grad_norm": 1.46875, + "learning_rate": 9.939032258064517e-05, + "loss": 0.2283, + "step": 879 + }, + { + "epoch": 0.01408, + "grad_norm": 1.1796875, + "learning_rate": 9.938870967741936e-05, + "loss": 0.2393, + "step": 880 + }, + { + "epoch": 0.014096, + "grad_norm": 2.125, + "learning_rate": 9.938709677419356e-05, + "loss": 0.22, + "step": 881 + }, + { + "epoch": 0.014112, + "grad_norm": 1.7109375, + "learning_rate": 9.938548387096775e-05, + "loss": 0.2907, + "step": 882 + }, + { + "epoch": 0.014128, + "grad_norm": 1.2578125, + "learning_rate": 9.938387096774195e-05, + "loss": 0.1981, + "step": 883 + }, + { + "epoch": 0.014144, + "grad_norm": 1.4453125, + "learning_rate": 9.938225806451613e-05, + "loss": 0.2293, + "step": 884 + }, + { + "epoch": 0.01416, + "grad_norm": 1.0703125, + "learning_rate": 9.938064516129033e-05, + "loss": 0.2157, + "step": 885 + }, + { + "epoch": 0.014176, + "grad_norm": 0.86328125, + "learning_rate": 9.937903225806452e-05, + "loss": 0.183, + "step": 886 + }, + { + "epoch": 0.014192, + "grad_norm": 0.9765625, + "learning_rate": 9.937741935483872e-05, + "loss": 0.1932, + "step": 887 + }, + { + "epoch": 0.014208, + "grad_norm": 0.859375, + "learning_rate": 9.93758064516129e-05, + "loss": 0.2034, + "step": 888 + }, + { + "epoch": 0.014224, + "grad_norm": 0.83203125, + "learning_rate": 9.93741935483871e-05, + "loss": 0.17, + "step": 889 + }, + { + "epoch": 0.01424, + "grad_norm": 1.15625, + "learning_rate": 9.937258064516129e-05, + "loss": 0.1972, + "step": 890 + }, + { + "epoch": 0.014256, + "grad_norm": 1.765625, + "learning_rate": 9.937096774193549e-05, + "loss": 0.166, + "step": 891 + }, + { + "epoch": 0.014272, + "grad_norm": 1.2265625, + "learning_rate": 9.936935483870969e-05, + "loss": 0.2194, + "step": 892 + }, + { + "epoch": 0.014288, + "grad_norm": 1.453125, + "learning_rate": 9.936774193548387e-05, + "loss": 0.2623, + "step": 893 + }, + { + "epoch": 0.014304, + "grad_norm": 0.9140625, + "learning_rate": 9.936612903225807e-05, + "loss": 0.1837, + "step": 894 + }, + { + "epoch": 0.01432, + "grad_norm": 1.3828125, + "learning_rate": 9.936451612903226e-05, + "loss": 0.2214, + "step": 895 + }, + { + "epoch": 0.014336, + "grad_norm": 1.0078125, + "learning_rate": 9.936290322580646e-05, + "loss": 0.1982, + "step": 896 + }, + { + "epoch": 0.014352, + "grad_norm": 1.5, + "learning_rate": 9.936129032258065e-05, + "loss": 0.2465, + "step": 897 + }, + { + "epoch": 0.014368, + "grad_norm": 0.984375, + "learning_rate": 9.935967741935485e-05, + "loss": 0.1675, + "step": 898 + }, + { + "epoch": 0.014384, + "grad_norm": 0.91015625, + "learning_rate": 9.935806451612903e-05, + "loss": 0.1721, + "step": 899 + }, + { + "epoch": 0.0144, + "grad_norm": 1.359375, + "learning_rate": 9.935645161290323e-05, + "loss": 0.2078, + "step": 900 + }, + { + "epoch": 0.014416, + "grad_norm": 1.125, + "learning_rate": 9.935483870967742e-05, + "loss": 0.1993, + "step": 901 + }, + { + "epoch": 0.014432, + "grad_norm": 1.078125, + "learning_rate": 9.935322580645162e-05, + "loss": 0.2175, + "step": 902 + }, + { + "epoch": 0.014448, + "grad_norm": 1.25, + "learning_rate": 9.935161290322582e-05, + "loss": 0.1971, + "step": 903 + }, + { + "epoch": 0.014464, + "grad_norm": 1.03125, + "learning_rate": 9.935000000000002e-05, + "loss": 0.2346, + "step": 904 + }, + { + "epoch": 0.01448, + "grad_norm": 0.92578125, + "learning_rate": 9.93483870967742e-05, + "loss": 0.17, + "step": 905 + }, + { + "epoch": 0.014496, + "grad_norm": 1.0703125, + "learning_rate": 9.934677419354839e-05, + "loss": 0.1834, + "step": 906 + }, + { + "epoch": 0.014512, + "grad_norm": 1.6171875, + "learning_rate": 9.934516129032259e-05, + "loss": 0.2397, + "step": 907 + }, + { + "epoch": 0.014528, + "grad_norm": 2.359375, + "learning_rate": 9.934354838709677e-05, + "loss": 0.1874, + "step": 908 + }, + { + "epoch": 0.014544, + "grad_norm": 1.5, + "learning_rate": 9.934193548387097e-05, + "loss": 0.198, + "step": 909 + }, + { + "epoch": 0.01456, + "grad_norm": 1.140625, + "learning_rate": 9.934032258064516e-05, + "loss": 0.2313, + "step": 910 + }, + { + "epoch": 0.014576, + "grad_norm": 0.87890625, + "learning_rate": 9.933870967741936e-05, + "loss": 0.2063, + "step": 911 + }, + { + "epoch": 0.014592, + "grad_norm": 1.0, + "learning_rate": 9.933709677419355e-05, + "loss": 0.2003, + "step": 912 + }, + { + "epoch": 0.014608, + "grad_norm": 1.0078125, + "learning_rate": 9.933548387096775e-05, + "loss": 0.1959, + "step": 913 + }, + { + "epoch": 0.014624, + "grad_norm": 1.0390625, + "learning_rate": 9.933387096774193e-05, + "loss": 0.2202, + "step": 914 + }, + { + "epoch": 0.01464, + "grad_norm": 0.9609375, + "learning_rate": 9.933225806451613e-05, + "loss": 0.2125, + "step": 915 + }, + { + "epoch": 0.014656, + "grad_norm": 1.1796875, + "learning_rate": 9.933064516129033e-05, + "loss": 0.2219, + "step": 916 + }, + { + "epoch": 0.014672, + "grad_norm": 0.93359375, + "learning_rate": 9.932903225806453e-05, + "loss": 0.2201, + "step": 917 + }, + { + "epoch": 0.014688, + "grad_norm": 1.046875, + "learning_rate": 9.932741935483872e-05, + "loss": 0.195, + "step": 918 + }, + { + "epoch": 0.014704, + "grad_norm": 1.1953125, + "learning_rate": 9.932580645161292e-05, + "loss": 0.2514, + "step": 919 + }, + { + "epoch": 0.01472, + "grad_norm": 0.890625, + "learning_rate": 9.93241935483871e-05, + "loss": 0.1727, + "step": 920 + }, + { + "epoch": 0.014736, + "grad_norm": 1.203125, + "learning_rate": 9.93225806451613e-05, + "loss": 0.248, + "step": 921 + }, + { + "epoch": 0.014752, + "grad_norm": 1.3515625, + "learning_rate": 9.932096774193549e-05, + "loss": 0.2325, + "step": 922 + }, + { + "epoch": 0.014768, + "grad_norm": 1.265625, + "learning_rate": 9.931935483870967e-05, + "loss": 0.2411, + "step": 923 + }, + { + "epoch": 0.014784, + "grad_norm": 0.9453125, + "learning_rate": 9.931774193548387e-05, + "loss": 0.2039, + "step": 924 + }, + { + "epoch": 0.0148, + "grad_norm": 0.87890625, + "learning_rate": 9.931612903225806e-05, + "loss": 0.1797, + "step": 925 + }, + { + "epoch": 0.014816, + "grad_norm": 1.1328125, + "learning_rate": 9.931451612903226e-05, + "loss": 0.1941, + "step": 926 + }, + { + "epoch": 0.014832, + "grad_norm": 0.89453125, + "learning_rate": 9.931290322580646e-05, + "loss": 0.1961, + "step": 927 + }, + { + "epoch": 0.014848, + "grad_norm": 1.1484375, + "learning_rate": 9.931129032258066e-05, + "loss": 0.2475, + "step": 928 + }, + { + "epoch": 0.014864, + "grad_norm": 1.1875, + "learning_rate": 9.930967741935484e-05, + "loss": 0.2143, + "step": 929 + }, + { + "epoch": 0.01488, + "grad_norm": 0.97265625, + "learning_rate": 9.930806451612904e-05, + "loss": 0.2077, + "step": 930 + }, + { + "epoch": 0.014896, + "grad_norm": 1.03125, + "learning_rate": 9.930645161290323e-05, + "loss": 0.2135, + "step": 931 + }, + { + "epoch": 0.014912, + "grad_norm": 1.140625, + "learning_rate": 9.930483870967743e-05, + "loss": 0.2256, + "step": 932 + }, + { + "epoch": 0.014928, + "grad_norm": 2.140625, + "learning_rate": 9.930322580645162e-05, + "loss": 0.2481, + "step": 933 + }, + { + "epoch": 0.014944, + "grad_norm": 1.421875, + "learning_rate": 9.930161290322582e-05, + "loss": 0.2516, + "step": 934 + }, + { + "epoch": 0.01496, + "grad_norm": 1.421875, + "learning_rate": 9.93e-05, + "loss": 0.2264, + "step": 935 + }, + { + "epoch": 0.014976, + "grad_norm": 1.2578125, + "learning_rate": 9.92983870967742e-05, + "loss": 0.2307, + "step": 936 + }, + { + "epoch": 0.014992, + "grad_norm": 1.171875, + "learning_rate": 9.929677419354839e-05, + "loss": 0.2463, + "step": 937 + }, + { + "epoch": 0.015008, + "grad_norm": 1.15625, + "learning_rate": 9.929516129032259e-05, + "loss": 0.2543, + "step": 938 + }, + { + "epoch": 0.015024, + "grad_norm": 1.28125, + "learning_rate": 9.929354838709679e-05, + "loss": 0.2362, + "step": 939 + }, + { + "epoch": 0.01504, + "grad_norm": 1.2578125, + "learning_rate": 9.929193548387097e-05, + "loss": 0.1823, + "step": 940 + }, + { + "epoch": 0.015056, + "grad_norm": 1.125, + "learning_rate": 9.929032258064517e-05, + "loss": 0.2382, + "step": 941 + }, + { + "epoch": 0.015072, + "grad_norm": 1.390625, + "learning_rate": 9.928870967741936e-05, + "loss": 0.2167, + "step": 942 + }, + { + "epoch": 0.015088, + "grad_norm": 1.25, + "learning_rate": 9.928709677419356e-05, + "loss": 0.1978, + "step": 943 + }, + { + "epoch": 0.015104, + "grad_norm": 1.3984375, + "learning_rate": 9.928548387096774e-05, + "loss": 0.2223, + "step": 944 + }, + { + "epoch": 0.01512, + "grad_norm": 1.171875, + "learning_rate": 9.928387096774194e-05, + "loss": 0.1772, + "step": 945 + }, + { + "epoch": 0.015136, + "grad_norm": 1.2890625, + "learning_rate": 9.928225806451613e-05, + "loss": 0.2323, + "step": 946 + }, + { + "epoch": 0.015152, + "grad_norm": 1.4765625, + "learning_rate": 9.928064516129033e-05, + "loss": 0.293, + "step": 947 + }, + { + "epoch": 0.015168, + "grad_norm": 0.96875, + "learning_rate": 9.927903225806452e-05, + "loss": 0.1987, + "step": 948 + }, + { + "epoch": 0.015184, + "grad_norm": 0.9921875, + "learning_rate": 9.927741935483871e-05, + "loss": 0.204, + "step": 949 + }, + { + "epoch": 0.0152, + "grad_norm": 1.0078125, + "learning_rate": 9.92758064516129e-05, + "loss": 0.1946, + "step": 950 + }, + { + "epoch": 0.015216, + "grad_norm": 1.5, + "learning_rate": 9.92741935483871e-05, + "loss": 0.1775, + "step": 951 + }, + { + "epoch": 0.015232, + "grad_norm": 1.3203125, + "learning_rate": 9.92725806451613e-05, + "loss": 0.2289, + "step": 952 + }, + { + "epoch": 0.015248, + "grad_norm": 1.75, + "learning_rate": 9.927096774193549e-05, + "loss": 0.194, + "step": 953 + }, + { + "epoch": 0.015264, + "grad_norm": 1.109375, + "learning_rate": 9.926935483870969e-05, + "loss": 0.1976, + "step": 954 + }, + { + "epoch": 0.01528, + "grad_norm": 1.5, + "learning_rate": 9.926774193548387e-05, + "loss": 0.1999, + "step": 955 + }, + { + "epoch": 0.015296, + "grad_norm": 1.5546875, + "learning_rate": 9.926612903225807e-05, + "loss": 0.2344, + "step": 956 + }, + { + "epoch": 0.015312, + "grad_norm": 1.265625, + "learning_rate": 9.926451612903226e-05, + "loss": 0.2161, + "step": 957 + }, + { + "epoch": 0.015328, + "grad_norm": 1.09375, + "learning_rate": 9.926290322580646e-05, + "loss": 0.2132, + "step": 958 + }, + { + "epoch": 0.015344, + "grad_norm": 1.453125, + "learning_rate": 9.926129032258064e-05, + "loss": 0.2381, + "step": 959 + }, + { + "epoch": 0.01536, + "grad_norm": 1.4921875, + "learning_rate": 9.925967741935484e-05, + "loss": 0.2242, + "step": 960 + }, + { + "epoch": 0.015376, + "grad_norm": 0.9140625, + "learning_rate": 9.925806451612903e-05, + "loss": 0.2199, + "step": 961 + }, + { + "epoch": 0.015392, + "grad_norm": 1.140625, + "learning_rate": 9.925645161290323e-05, + "loss": 0.2266, + "step": 962 + }, + { + "epoch": 0.015408, + "grad_norm": 1.53125, + "learning_rate": 9.925483870967743e-05, + "loss": 0.232, + "step": 963 + }, + { + "epoch": 0.015424, + "grad_norm": 1.3828125, + "learning_rate": 9.925322580645163e-05, + "loss": 0.223, + "step": 964 + }, + { + "epoch": 0.01544, + "grad_norm": 0.98046875, + "learning_rate": 9.925161290322581e-05, + "loss": 0.2572, + "step": 965 + }, + { + "epoch": 0.015456, + "grad_norm": 1.03125, + "learning_rate": 9.925000000000001e-05, + "loss": 0.2041, + "step": 966 + }, + { + "epoch": 0.015472, + "grad_norm": 1.0078125, + "learning_rate": 9.92483870967742e-05, + "loss": 0.1934, + "step": 967 + }, + { + "epoch": 0.015488, + "grad_norm": 1.3671875, + "learning_rate": 9.924677419354839e-05, + "loss": 0.2442, + "step": 968 + }, + { + "epoch": 0.015504, + "grad_norm": 1.9921875, + "learning_rate": 9.924516129032259e-05, + "loss": 0.2174, + "step": 969 + }, + { + "epoch": 0.01552, + "grad_norm": 0.93359375, + "learning_rate": 9.924354838709677e-05, + "loss": 0.1831, + "step": 970 + }, + { + "epoch": 0.015536, + "grad_norm": 1.5078125, + "learning_rate": 9.924193548387097e-05, + "loss": 0.2353, + "step": 971 + }, + { + "epoch": 0.015552, + "grad_norm": 0.84765625, + "learning_rate": 9.924032258064516e-05, + "loss": 0.2031, + "step": 972 + }, + { + "epoch": 0.015568, + "grad_norm": 1.734375, + "learning_rate": 9.923870967741936e-05, + "loss": 0.229, + "step": 973 + }, + { + "epoch": 0.015584, + "grad_norm": 1.40625, + "learning_rate": 9.923709677419356e-05, + "loss": 0.2247, + "step": 974 + }, + { + "epoch": 0.0156, + "grad_norm": 1.15625, + "learning_rate": 9.923548387096774e-05, + "loss": 0.2354, + "step": 975 + }, + { + "epoch": 0.015616, + "grad_norm": 0.82421875, + "learning_rate": 9.923387096774194e-05, + "loss": 0.1915, + "step": 976 + }, + { + "epoch": 0.015632, + "grad_norm": 1.5, + "learning_rate": 9.923225806451614e-05, + "loss": 0.2383, + "step": 977 + }, + { + "epoch": 0.015648, + "grad_norm": 1.1484375, + "learning_rate": 9.923064516129033e-05, + "loss": 0.2187, + "step": 978 + }, + { + "epoch": 0.015664, + "grad_norm": 1.15625, + "learning_rate": 9.922903225806453e-05, + "loss": 0.207, + "step": 979 + }, + { + "epoch": 0.01568, + "grad_norm": 1.8984375, + "learning_rate": 9.922741935483871e-05, + "loss": 0.2065, + "step": 980 + }, + { + "epoch": 0.015696, + "grad_norm": 0.91015625, + "learning_rate": 9.922580645161291e-05, + "loss": 0.21, + "step": 981 + }, + { + "epoch": 0.015712, + "grad_norm": 0.9765625, + "learning_rate": 9.92241935483871e-05, + "loss": 0.1415, + "step": 982 + }, + { + "epoch": 0.015728, + "grad_norm": 1.2265625, + "learning_rate": 9.92225806451613e-05, + "loss": 0.1859, + "step": 983 + }, + { + "epoch": 0.015744, + "grad_norm": 1.25, + "learning_rate": 9.922096774193549e-05, + "loss": 0.2135, + "step": 984 + }, + { + "epoch": 0.01576, + "grad_norm": 1.5546875, + "learning_rate": 9.921935483870967e-05, + "loss": 0.2106, + "step": 985 + }, + { + "epoch": 0.015776, + "grad_norm": 1.6328125, + "learning_rate": 9.921774193548387e-05, + "loss": 0.2612, + "step": 986 + }, + { + "epoch": 0.015792, + "grad_norm": 1.8515625, + "learning_rate": 9.921612903225807e-05, + "loss": 0.3134, + "step": 987 + }, + { + "epoch": 0.015808, + "grad_norm": 2.0625, + "learning_rate": 9.921451612903227e-05, + "loss": 0.2682, + "step": 988 + }, + { + "epoch": 0.015824, + "grad_norm": 1.1875, + "learning_rate": 9.921290322580646e-05, + "loss": 0.1606, + "step": 989 + }, + { + "epoch": 0.01584, + "grad_norm": 1.0859375, + "learning_rate": 9.921129032258066e-05, + "loss": 0.2534, + "step": 990 + }, + { + "epoch": 0.015856, + "grad_norm": 1.4921875, + "learning_rate": 9.920967741935484e-05, + "loss": 0.1944, + "step": 991 + }, + { + "epoch": 0.015872, + "grad_norm": 0.77734375, + "learning_rate": 9.920806451612904e-05, + "loss": 0.1889, + "step": 992 + }, + { + "epoch": 0.015888, + "grad_norm": 1.078125, + "learning_rate": 9.920645161290323e-05, + "loss": 0.2028, + "step": 993 + }, + { + "epoch": 0.015904, + "grad_norm": 1.0703125, + "learning_rate": 9.920483870967743e-05, + "loss": 0.2315, + "step": 994 + }, + { + "epoch": 0.01592, + "grad_norm": 1.4140625, + "learning_rate": 9.920322580645161e-05, + "loss": 0.242, + "step": 995 + }, + { + "epoch": 0.015936, + "grad_norm": 2.3125, + "learning_rate": 9.920161290322581e-05, + "loss": 0.2328, + "step": 996 + }, + { + "epoch": 0.015952, + "grad_norm": 1.4453125, + "learning_rate": 9.92e-05, + "loss": 0.2631, + "step": 997 + }, + { + "epoch": 0.015968, + "grad_norm": 1.4375, + "learning_rate": 9.91983870967742e-05, + "loss": 0.2386, + "step": 998 + }, + { + "epoch": 0.015984, + "grad_norm": 0.8515625, + "learning_rate": 9.91967741935484e-05, + "loss": 0.1959, + "step": 999 + }, + { + "epoch": 0.016, + "grad_norm": 1.1328125, + "learning_rate": 9.919516129032258e-05, + "loss": 0.1662, + "step": 1000 + }, + { + "epoch": 0.016016, + "grad_norm": 1.7265625, + "learning_rate": 9.919354838709678e-05, + "loss": 0.2002, + "step": 1001 + }, + { + "epoch": 0.016032, + "grad_norm": 1.2890625, + "learning_rate": 9.919193548387097e-05, + "loss": 0.207, + "step": 1002 + }, + { + "epoch": 0.016048, + "grad_norm": 1.09375, + "learning_rate": 9.919032258064517e-05, + "loss": 0.2162, + "step": 1003 + }, + { + "epoch": 0.016064, + "grad_norm": 0.99609375, + "learning_rate": 9.918870967741936e-05, + "loss": 0.1887, + "step": 1004 + }, + { + "epoch": 0.01608, + "grad_norm": 2.109375, + "learning_rate": 9.918709677419356e-05, + "loss": 0.2322, + "step": 1005 + }, + { + "epoch": 0.016096, + "grad_norm": 0.8984375, + "learning_rate": 9.918548387096774e-05, + "loss": 0.1847, + "step": 1006 + }, + { + "epoch": 0.016112, + "grad_norm": 1.7578125, + "learning_rate": 9.918387096774194e-05, + "loss": 0.2638, + "step": 1007 + }, + { + "epoch": 0.016128, + "grad_norm": 1.21875, + "learning_rate": 9.918225806451613e-05, + "loss": 0.2171, + "step": 1008 + }, + { + "epoch": 0.016144, + "grad_norm": 1.0546875, + "learning_rate": 9.918064516129033e-05, + "loss": 0.2025, + "step": 1009 + }, + { + "epoch": 0.01616, + "grad_norm": 1.0, + "learning_rate": 9.917903225806451e-05, + "loss": 0.2127, + "step": 1010 + }, + { + "epoch": 0.016176, + "grad_norm": 1.0625, + "learning_rate": 9.917741935483871e-05, + "loss": 0.2048, + "step": 1011 + }, + { + "epoch": 0.016192, + "grad_norm": 0.80078125, + "learning_rate": 9.917580645161291e-05, + "loss": 0.2043, + "step": 1012 + }, + { + "epoch": 0.016208, + "grad_norm": 1.2265625, + "learning_rate": 9.917419354838711e-05, + "loss": 0.2233, + "step": 1013 + }, + { + "epoch": 0.016224, + "grad_norm": 1.078125, + "learning_rate": 9.91725806451613e-05, + "loss": 0.2175, + "step": 1014 + }, + { + "epoch": 0.01624, + "grad_norm": 1.3515625, + "learning_rate": 9.917096774193548e-05, + "loss": 0.2313, + "step": 1015 + }, + { + "epoch": 0.016256, + "grad_norm": 1.5859375, + "learning_rate": 9.916935483870968e-05, + "loss": 0.2434, + "step": 1016 + }, + { + "epoch": 0.016272, + "grad_norm": 1.125, + "learning_rate": 9.916774193548387e-05, + "loss": 0.2499, + "step": 1017 + }, + { + "epoch": 0.016288, + "grad_norm": 1.96875, + "learning_rate": 9.916612903225807e-05, + "loss": 0.2834, + "step": 1018 + }, + { + "epoch": 0.016304, + "grad_norm": 1.0390625, + "learning_rate": 9.916451612903226e-05, + "loss": 0.1885, + "step": 1019 + }, + { + "epoch": 0.01632, + "grad_norm": 1.140625, + "learning_rate": 9.916290322580645e-05, + "loss": 0.2378, + "step": 1020 + }, + { + "epoch": 0.016336, + "grad_norm": 1.1015625, + "learning_rate": 9.916129032258064e-05, + "loss": 0.2285, + "step": 1021 + }, + { + "epoch": 0.016352, + "grad_norm": 1.0234375, + "learning_rate": 9.915967741935484e-05, + "loss": 0.2521, + "step": 1022 + }, + { + "epoch": 0.016368, + "grad_norm": 1.5, + "learning_rate": 9.915806451612904e-05, + "loss": 0.246, + "step": 1023 + }, + { + "epoch": 0.016384, + "grad_norm": 1.984375, + "learning_rate": 9.915645161290324e-05, + "loss": 0.1861, + "step": 1024 + }, + { + "epoch": 0.0164, + "grad_norm": 1.1875, + "learning_rate": 9.915483870967743e-05, + "loss": 0.2249, + "step": 1025 + }, + { + "epoch": 0.016416, + "grad_norm": 1.109375, + "learning_rate": 9.915322580645163e-05, + "loss": 0.2032, + "step": 1026 + }, + { + "epoch": 0.016432, + "grad_norm": 1.9296875, + "learning_rate": 9.915161290322581e-05, + "loss": 0.235, + "step": 1027 + }, + { + "epoch": 0.016448, + "grad_norm": 1.6875, + "learning_rate": 9.915000000000001e-05, + "loss": 0.2254, + "step": 1028 + }, + { + "epoch": 0.016464, + "grad_norm": 1.2265625, + "learning_rate": 9.91483870967742e-05, + "loss": 0.2247, + "step": 1029 + }, + { + "epoch": 0.01648, + "grad_norm": 1.859375, + "learning_rate": 9.91467741935484e-05, + "loss": 0.2385, + "step": 1030 + }, + { + "epoch": 0.016496, + "grad_norm": 1.15625, + "learning_rate": 9.914516129032258e-05, + "loss": 0.1808, + "step": 1031 + }, + { + "epoch": 0.016512, + "grad_norm": 0.85546875, + "learning_rate": 9.914354838709677e-05, + "loss": 0.1819, + "step": 1032 + }, + { + "epoch": 0.016528, + "grad_norm": 0.85546875, + "learning_rate": 9.914193548387097e-05, + "loss": 0.1744, + "step": 1033 + }, + { + "epoch": 0.016544, + "grad_norm": 0.84375, + "learning_rate": 9.914032258064517e-05, + "loss": 0.1631, + "step": 1034 + }, + { + "epoch": 0.01656, + "grad_norm": 0.72265625, + "learning_rate": 9.913870967741937e-05, + "loss": 0.1968, + "step": 1035 + }, + { + "epoch": 0.016576, + "grad_norm": 0.94921875, + "learning_rate": 9.913709677419355e-05, + "loss": 0.18, + "step": 1036 + }, + { + "epoch": 0.016592, + "grad_norm": 1.0078125, + "learning_rate": 9.913548387096775e-05, + "loss": 0.1678, + "step": 1037 + }, + { + "epoch": 0.016608, + "grad_norm": 0.93359375, + "learning_rate": 9.913387096774194e-05, + "loss": 0.2357, + "step": 1038 + }, + { + "epoch": 0.016624, + "grad_norm": 1.703125, + "learning_rate": 9.913225806451614e-05, + "loss": 0.1982, + "step": 1039 + }, + { + "epoch": 0.01664, + "grad_norm": 1.2109375, + "learning_rate": 9.913064516129033e-05, + "loss": 0.2487, + "step": 1040 + }, + { + "epoch": 0.016656, + "grad_norm": 1.046875, + "learning_rate": 9.912903225806453e-05, + "loss": 0.1986, + "step": 1041 + }, + { + "epoch": 0.016672, + "grad_norm": 1.171875, + "learning_rate": 9.912741935483871e-05, + "loss": 0.1869, + "step": 1042 + }, + { + "epoch": 0.016688, + "grad_norm": 1.09375, + "learning_rate": 9.912580645161291e-05, + "loss": 0.2055, + "step": 1043 + }, + { + "epoch": 0.016704, + "grad_norm": 1.0546875, + "learning_rate": 9.91241935483871e-05, + "loss": 0.201, + "step": 1044 + }, + { + "epoch": 0.01672, + "grad_norm": 1.1015625, + "learning_rate": 9.91225806451613e-05, + "loss": 0.2129, + "step": 1045 + }, + { + "epoch": 0.016736, + "grad_norm": 0.7734375, + "learning_rate": 9.912096774193548e-05, + "loss": 0.22, + "step": 1046 + }, + { + "epoch": 0.016752, + "grad_norm": 1.1171875, + "learning_rate": 9.911935483870968e-05, + "loss": 0.2436, + "step": 1047 + }, + { + "epoch": 0.016768, + "grad_norm": 1.0859375, + "learning_rate": 9.911774193548388e-05, + "loss": 0.2204, + "step": 1048 + }, + { + "epoch": 0.016784, + "grad_norm": 1.15625, + "learning_rate": 9.911612903225807e-05, + "loss": 0.2015, + "step": 1049 + }, + { + "epoch": 0.0168, + "grad_norm": 1.0234375, + "learning_rate": 9.911451612903227e-05, + "loss": 0.2226, + "step": 1050 + }, + { + "epoch": 0.016816, + "grad_norm": 1.140625, + "learning_rate": 9.911290322580645e-05, + "loss": 0.158, + "step": 1051 + }, + { + "epoch": 0.016832, + "grad_norm": 0.98828125, + "learning_rate": 9.911129032258065e-05, + "loss": 0.1885, + "step": 1052 + }, + { + "epoch": 0.016848, + "grad_norm": 1.515625, + "learning_rate": 9.910967741935484e-05, + "loss": 0.2088, + "step": 1053 + }, + { + "epoch": 0.016864, + "grad_norm": 1.390625, + "learning_rate": 9.910806451612904e-05, + "loss": 0.18, + "step": 1054 + }, + { + "epoch": 0.01688, + "grad_norm": 1.21875, + "learning_rate": 9.910645161290323e-05, + "loss": 0.1843, + "step": 1055 + }, + { + "epoch": 0.016896, + "grad_norm": 1.046875, + "learning_rate": 9.910483870967742e-05, + "loss": 0.2017, + "step": 1056 + }, + { + "epoch": 0.016912, + "grad_norm": 0.94921875, + "learning_rate": 9.910322580645161e-05, + "loss": 0.2178, + "step": 1057 + }, + { + "epoch": 0.016928, + "grad_norm": 1.28125, + "learning_rate": 9.910161290322581e-05, + "loss": 0.2007, + "step": 1058 + }, + { + "epoch": 0.016944, + "grad_norm": 0.96484375, + "learning_rate": 9.910000000000001e-05, + "loss": 0.2185, + "step": 1059 + }, + { + "epoch": 0.01696, + "grad_norm": 1.328125, + "learning_rate": 9.909838709677421e-05, + "loss": 0.216, + "step": 1060 + }, + { + "epoch": 0.016976, + "grad_norm": 0.98828125, + "learning_rate": 9.90967741935484e-05, + "loss": 0.2053, + "step": 1061 + }, + { + "epoch": 0.016992, + "grad_norm": 0.859375, + "learning_rate": 9.909516129032258e-05, + "loss": 0.1728, + "step": 1062 + }, + { + "epoch": 0.017008, + "grad_norm": 1.0390625, + "learning_rate": 9.909354838709678e-05, + "loss": 0.2058, + "step": 1063 + }, + { + "epoch": 0.017024, + "grad_norm": 1.03125, + "learning_rate": 9.909193548387097e-05, + "loss": 0.1749, + "step": 1064 + }, + { + "epoch": 0.01704, + "grad_norm": 1.203125, + "learning_rate": 9.909032258064517e-05, + "loss": 0.1672, + "step": 1065 + }, + { + "epoch": 0.017056, + "grad_norm": 1.3046875, + "learning_rate": 9.908870967741935e-05, + "loss": 0.2164, + "step": 1066 + }, + { + "epoch": 0.017072, + "grad_norm": 0.9609375, + "learning_rate": 9.908709677419355e-05, + "loss": 0.212, + "step": 1067 + }, + { + "epoch": 0.017088, + "grad_norm": 1.203125, + "learning_rate": 9.908548387096774e-05, + "loss": 0.2312, + "step": 1068 + }, + { + "epoch": 0.017104, + "grad_norm": 0.671875, + "learning_rate": 9.908387096774194e-05, + "loss": 0.1643, + "step": 1069 + }, + { + "epoch": 0.01712, + "grad_norm": 1.2109375, + "learning_rate": 9.908225806451612e-05, + "loss": 0.1843, + "step": 1070 + }, + { + "epoch": 0.017136, + "grad_norm": 1.640625, + "learning_rate": 9.908064516129032e-05, + "loss": 0.2651, + "step": 1071 + }, + { + "epoch": 0.017152, + "grad_norm": 0.90234375, + "learning_rate": 9.907903225806452e-05, + "loss": 0.1808, + "step": 1072 + }, + { + "epoch": 0.017168, + "grad_norm": 1.0703125, + "learning_rate": 9.907741935483872e-05, + "loss": 0.1896, + "step": 1073 + }, + { + "epoch": 0.017184, + "grad_norm": 0.9609375, + "learning_rate": 9.907580645161291e-05, + "loss": 0.1553, + "step": 1074 + }, + { + "epoch": 0.0172, + "grad_norm": 1.078125, + "learning_rate": 9.907419354838711e-05, + "loss": 0.208, + "step": 1075 + }, + { + "epoch": 0.017216, + "grad_norm": 1.1875, + "learning_rate": 9.90725806451613e-05, + "loss": 0.2051, + "step": 1076 + }, + { + "epoch": 0.017232, + "grad_norm": 1.03125, + "learning_rate": 9.907096774193548e-05, + "loss": 0.2097, + "step": 1077 + }, + { + "epoch": 0.017248, + "grad_norm": 1.1640625, + "learning_rate": 9.906935483870968e-05, + "loss": 0.2535, + "step": 1078 + }, + { + "epoch": 0.017264, + "grad_norm": 1.21875, + "learning_rate": 9.906774193548387e-05, + "loss": 0.2243, + "step": 1079 + }, + { + "epoch": 0.01728, + "grad_norm": 1.265625, + "learning_rate": 9.906612903225807e-05, + "loss": 0.2259, + "step": 1080 + }, + { + "epoch": 0.017296, + "grad_norm": 1.65625, + "learning_rate": 9.906451612903225e-05, + "loss": 0.207, + "step": 1081 + }, + { + "epoch": 0.017312, + "grad_norm": 1.09375, + "learning_rate": 9.906290322580645e-05, + "loss": 0.1751, + "step": 1082 + }, + { + "epoch": 0.017328, + "grad_norm": 1.21875, + "learning_rate": 9.906129032258065e-05, + "loss": 0.2246, + "step": 1083 + }, + { + "epoch": 0.017344, + "grad_norm": 1.390625, + "learning_rate": 9.905967741935485e-05, + "loss": 0.2195, + "step": 1084 + }, + { + "epoch": 0.01736, + "grad_norm": 1.0625, + "learning_rate": 9.905806451612904e-05, + "loss": 0.1691, + "step": 1085 + }, + { + "epoch": 0.017376, + "grad_norm": 1.109375, + "learning_rate": 9.905645161290324e-05, + "loss": 0.2114, + "step": 1086 + }, + { + "epoch": 0.017392, + "grad_norm": 0.95703125, + "learning_rate": 9.905483870967742e-05, + "loss": 0.1908, + "step": 1087 + }, + { + "epoch": 0.017408, + "grad_norm": 1.234375, + "learning_rate": 9.905322580645162e-05, + "loss": 0.1846, + "step": 1088 + }, + { + "epoch": 0.017424, + "grad_norm": 2.0625, + "learning_rate": 9.905161290322581e-05, + "loss": 0.2338, + "step": 1089 + }, + { + "epoch": 0.01744, + "grad_norm": 1.1875, + "learning_rate": 9.905000000000001e-05, + "loss": 0.2479, + "step": 1090 + }, + { + "epoch": 0.017456, + "grad_norm": 1.1171875, + "learning_rate": 9.90483870967742e-05, + "loss": 0.2415, + "step": 1091 + }, + { + "epoch": 0.017472, + "grad_norm": 1.109375, + "learning_rate": 9.90467741935484e-05, + "loss": 0.1839, + "step": 1092 + }, + { + "epoch": 0.017488, + "grad_norm": 1.3671875, + "learning_rate": 9.904516129032258e-05, + "loss": 0.2018, + "step": 1093 + }, + { + "epoch": 0.017504, + "grad_norm": 1.3125, + "learning_rate": 9.904354838709678e-05, + "loss": 0.2404, + "step": 1094 + }, + { + "epoch": 0.01752, + "grad_norm": 1.359375, + "learning_rate": 9.904193548387098e-05, + "loss": 0.1787, + "step": 1095 + }, + { + "epoch": 0.017536, + "grad_norm": 1.0703125, + "learning_rate": 9.904032258064517e-05, + "loss": 0.1907, + "step": 1096 + }, + { + "epoch": 0.017552, + "grad_norm": 1.2734375, + "learning_rate": 9.903870967741937e-05, + "loss": 0.2577, + "step": 1097 + }, + { + "epoch": 0.017568, + "grad_norm": 1.140625, + "learning_rate": 9.903709677419355e-05, + "loss": 0.2167, + "step": 1098 + }, + { + "epoch": 0.017584, + "grad_norm": 0.8359375, + "learning_rate": 9.903548387096775e-05, + "loss": 0.1922, + "step": 1099 + }, + { + "epoch": 0.0176, + "grad_norm": 1.140625, + "learning_rate": 9.903387096774194e-05, + "loss": 0.1884, + "step": 1100 + }, + { + "epoch": 0.017616, + "grad_norm": 0.93359375, + "learning_rate": 9.903225806451614e-05, + "loss": 0.1863, + "step": 1101 + }, + { + "epoch": 0.017632, + "grad_norm": 1.1640625, + "learning_rate": 9.903064516129032e-05, + "loss": 0.2127, + "step": 1102 + }, + { + "epoch": 0.017648, + "grad_norm": 1.3203125, + "learning_rate": 9.902903225806452e-05, + "loss": 0.2418, + "step": 1103 + }, + { + "epoch": 0.017664, + "grad_norm": 1.5078125, + "learning_rate": 9.902741935483871e-05, + "loss": 0.2109, + "step": 1104 + }, + { + "epoch": 0.01768, + "grad_norm": 1.4765625, + "learning_rate": 9.902580645161291e-05, + "loss": 0.2571, + "step": 1105 + }, + { + "epoch": 0.017696, + "grad_norm": 1.09375, + "learning_rate": 9.90241935483871e-05, + "loss": 0.1902, + "step": 1106 + }, + { + "epoch": 0.017712, + "grad_norm": 0.87109375, + "learning_rate": 9.90225806451613e-05, + "loss": 0.1861, + "step": 1107 + }, + { + "epoch": 0.017728, + "grad_norm": 1.015625, + "learning_rate": 9.90209677419355e-05, + "loss": 0.2162, + "step": 1108 + }, + { + "epoch": 0.017744, + "grad_norm": 1.25, + "learning_rate": 9.901935483870968e-05, + "loss": 0.206, + "step": 1109 + }, + { + "epoch": 0.01776, + "grad_norm": 1.1875, + "learning_rate": 9.901774193548388e-05, + "loss": 0.1942, + "step": 1110 + }, + { + "epoch": 0.017776, + "grad_norm": 1.125, + "learning_rate": 9.901612903225807e-05, + "loss": 0.248, + "step": 1111 + }, + { + "epoch": 0.017792, + "grad_norm": 1.0703125, + "learning_rate": 9.901451612903227e-05, + "loss": 0.1916, + "step": 1112 + }, + { + "epoch": 0.017808, + "grad_norm": 0.93359375, + "learning_rate": 9.901290322580645e-05, + "loss": 0.2186, + "step": 1113 + }, + { + "epoch": 0.017824, + "grad_norm": 1.796875, + "learning_rate": 9.901129032258065e-05, + "loss": 0.222, + "step": 1114 + }, + { + "epoch": 0.01784, + "grad_norm": 2.03125, + "learning_rate": 9.900967741935484e-05, + "loss": 0.216, + "step": 1115 + }, + { + "epoch": 0.017856, + "grad_norm": 0.9609375, + "learning_rate": 9.900806451612904e-05, + "loss": 0.2351, + "step": 1116 + }, + { + "epoch": 0.017872, + "grad_norm": 1.4140625, + "learning_rate": 9.900645161290322e-05, + "loss": 0.2087, + "step": 1117 + }, + { + "epoch": 0.017888, + "grad_norm": 1.015625, + "learning_rate": 9.900483870967742e-05, + "loss": 0.1896, + "step": 1118 + }, + { + "epoch": 0.017904, + "grad_norm": 0.95703125, + "learning_rate": 9.900322580645162e-05, + "loss": 0.2058, + "step": 1119 + }, + { + "epoch": 0.01792, + "grad_norm": 1.6015625, + "learning_rate": 9.900161290322582e-05, + "loss": 0.2242, + "step": 1120 + }, + { + "epoch": 0.017936, + "grad_norm": 1.2734375, + "learning_rate": 9.900000000000001e-05, + "loss": 0.2271, + "step": 1121 + }, + { + "epoch": 0.017952, + "grad_norm": 1.203125, + "learning_rate": 9.899838709677421e-05, + "loss": 0.2165, + "step": 1122 + }, + { + "epoch": 0.017968, + "grad_norm": 0.96484375, + "learning_rate": 9.89967741935484e-05, + "loss": 0.1905, + "step": 1123 + }, + { + "epoch": 0.017984, + "grad_norm": 0.95703125, + "learning_rate": 9.899516129032258e-05, + "loss": 0.1815, + "step": 1124 + }, + { + "epoch": 0.018, + "grad_norm": 1.0234375, + "learning_rate": 9.899354838709678e-05, + "loss": 0.2033, + "step": 1125 + }, + { + "epoch": 0.018016, + "grad_norm": 0.83203125, + "learning_rate": 9.899193548387097e-05, + "loss": 0.162, + "step": 1126 + }, + { + "epoch": 0.018032, + "grad_norm": 0.90234375, + "learning_rate": 9.899032258064516e-05, + "loss": 0.1874, + "step": 1127 + }, + { + "epoch": 0.018048, + "grad_norm": 0.78125, + "learning_rate": 9.898870967741935e-05, + "loss": 0.1711, + "step": 1128 + }, + { + "epoch": 0.018064, + "grad_norm": 1.078125, + "learning_rate": 9.898709677419355e-05, + "loss": 0.223, + "step": 1129 + }, + { + "epoch": 0.01808, + "grad_norm": 0.9140625, + "learning_rate": 9.898548387096775e-05, + "loss": 0.2065, + "step": 1130 + }, + { + "epoch": 0.018096, + "grad_norm": 0.99609375, + "learning_rate": 9.898387096774195e-05, + "loss": 0.2231, + "step": 1131 + }, + { + "epoch": 0.018112, + "grad_norm": 1.0, + "learning_rate": 9.898225806451614e-05, + "loss": 0.1947, + "step": 1132 + }, + { + "epoch": 0.018128, + "grad_norm": 1.140625, + "learning_rate": 9.898064516129034e-05, + "loss": 0.1846, + "step": 1133 + }, + { + "epoch": 0.018144, + "grad_norm": 1.3203125, + "learning_rate": 9.897903225806452e-05, + "loss": 0.2082, + "step": 1134 + }, + { + "epoch": 0.01816, + "grad_norm": 0.76171875, + "learning_rate": 9.897741935483872e-05, + "loss": 0.2343, + "step": 1135 + }, + { + "epoch": 0.018176, + "grad_norm": 1.546875, + "learning_rate": 9.897580645161291e-05, + "loss": 0.2419, + "step": 1136 + }, + { + "epoch": 0.018192, + "grad_norm": 1.0625, + "learning_rate": 9.897419354838711e-05, + "loss": 0.2, + "step": 1137 + }, + { + "epoch": 0.018208, + "grad_norm": 1.234375, + "learning_rate": 9.897258064516129e-05, + "loss": 0.2, + "step": 1138 + }, + { + "epoch": 0.018224, + "grad_norm": 0.88671875, + "learning_rate": 9.897096774193549e-05, + "loss": 0.2162, + "step": 1139 + }, + { + "epoch": 0.01824, + "grad_norm": 0.8203125, + "learning_rate": 9.896935483870968e-05, + "loss": 0.1667, + "step": 1140 + }, + { + "epoch": 0.018256, + "grad_norm": 1.1875, + "learning_rate": 9.896774193548386e-05, + "loss": 0.2513, + "step": 1141 + }, + { + "epoch": 0.018272, + "grad_norm": 1.109375, + "learning_rate": 9.896612903225806e-05, + "loss": 0.2293, + "step": 1142 + }, + { + "epoch": 0.018288, + "grad_norm": 0.98046875, + "learning_rate": 9.896451612903226e-05, + "loss": 0.2194, + "step": 1143 + }, + { + "epoch": 0.018304, + "grad_norm": 1.390625, + "learning_rate": 9.896290322580646e-05, + "loss": 0.1985, + "step": 1144 + }, + { + "epoch": 0.01832, + "grad_norm": 0.80859375, + "learning_rate": 9.896129032258065e-05, + "loss": 0.2246, + "step": 1145 + }, + { + "epoch": 0.018336, + "grad_norm": 1.203125, + "learning_rate": 9.895967741935485e-05, + "loss": 0.2066, + "step": 1146 + }, + { + "epoch": 0.018352, + "grad_norm": 1.171875, + "learning_rate": 9.895806451612904e-05, + "loss": 0.2621, + "step": 1147 + }, + { + "epoch": 0.018368, + "grad_norm": 0.8828125, + "learning_rate": 9.895645161290323e-05, + "loss": 0.1865, + "step": 1148 + }, + { + "epoch": 0.018384, + "grad_norm": 1.3515625, + "learning_rate": 9.895483870967742e-05, + "loss": 0.3197, + "step": 1149 + }, + { + "epoch": 0.0184, + "grad_norm": 1.5234375, + "learning_rate": 9.895322580645162e-05, + "loss": 0.2556, + "step": 1150 + }, + { + "epoch": 0.018416, + "grad_norm": 1.2265625, + "learning_rate": 9.89516129032258e-05, + "loss": 0.244, + "step": 1151 + }, + { + "epoch": 0.018432, + "grad_norm": 1.4375, + "learning_rate": 9.895e-05, + "loss": 0.2353, + "step": 1152 + }, + { + "epoch": 0.018448, + "grad_norm": 0.9296875, + "learning_rate": 9.894838709677419e-05, + "loss": 0.2094, + "step": 1153 + }, + { + "epoch": 0.018464, + "grad_norm": 0.98828125, + "learning_rate": 9.894677419354839e-05, + "loss": 0.2157, + "step": 1154 + }, + { + "epoch": 0.01848, + "grad_norm": 1.3671875, + "learning_rate": 9.894516129032259e-05, + "loss": 0.2361, + "step": 1155 + }, + { + "epoch": 0.018496, + "grad_norm": 1.0703125, + "learning_rate": 9.894354838709678e-05, + "loss": 0.1895, + "step": 1156 + }, + { + "epoch": 0.018512, + "grad_norm": 1.21875, + "learning_rate": 9.894193548387098e-05, + "loss": 0.2365, + "step": 1157 + }, + { + "epoch": 0.018528, + "grad_norm": 1.2734375, + "learning_rate": 9.894032258064516e-05, + "loss": 0.2125, + "step": 1158 + }, + { + "epoch": 0.018544, + "grad_norm": 1.3125, + "learning_rate": 9.893870967741936e-05, + "loss": 0.2057, + "step": 1159 + }, + { + "epoch": 0.01856, + "grad_norm": 1.40625, + "learning_rate": 9.893709677419355e-05, + "loss": 0.2017, + "step": 1160 + }, + { + "epoch": 0.018576, + "grad_norm": 1.3515625, + "learning_rate": 9.893548387096775e-05, + "loss": 0.2241, + "step": 1161 + }, + { + "epoch": 0.018592, + "grad_norm": 1.3046875, + "learning_rate": 9.893387096774193e-05, + "loss": 0.2166, + "step": 1162 + }, + { + "epoch": 0.018608, + "grad_norm": 1.2890625, + "learning_rate": 9.893225806451613e-05, + "loss": 0.1854, + "step": 1163 + }, + { + "epoch": 0.018624, + "grad_norm": 1.140625, + "learning_rate": 9.893064516129032e-05, + "loss": 0.2416, + "step": 1164 + }, + { + "epoch": 0.01864, + "grad_norm": 0.94140625, + "learning_rate": 9.892903225806452e-05, + "loss": 0.2084, + "step": 1165 + }, + { + "epoch": 0.018656, + "grad_norm": 1.3203125, + "learning_rate": 9.89274193548387e-05, + "loss": 0.2347, + "step": 1166 + }, + { + "epoch": 0.018672, + "grad_norm": 1.0078125, + "learning_rate": 9.89258064516129e-05, + "loss": 0.1768, + "step": 1167 + }, + { + "epoch": 0.018688, + "grad_norm": 0.8984375, + "learning_rate": 9.89241935483871e-05, + "loss": 0.2025, + "step": 1168 + }, + { + "epoch": 0.018704, + "grad_norm": 1.0625, + "learning_rate": 9.89225806451613e-05, + "loss": 0.1906, + "step": 1169 + }, + { + "epoch": 0.01872, + "grad_norm": 1.234375, + "learning_rate": 9.892096774193549e-05, + "loss": 0.2282, + "step": 1170 + }, + { + "epoch": 0.018736, + "grad_norm": 1.6484375, + "learning_rate": 9.891935483870968e-05, + "loss": 0.2181, + "step": 1171 + }, + { + "epoch": 0.018752, + "grad_norm": 0.90625, + "learning_rate": 9.891774193548388e-05, + "loss": 0.2009, + "step": 1172 + }, + { + "epoch": 0.018768, + "grad_norm": 0.890625, + "learning_rate": 9.891612903225806e-05, + "loss": 0.1886, + "step": 1173 + }, + { + "epoch": 0.018784, + "grad_norm": 1.046875, + "learning_rate": 9.891451612903226e-05, + "loss": 0.1975, + "step": 1174 + }, + { + "epoch": 0.0188, + "grad_norm": 0.9140625, + "learning_rate": 9.891290322580645e-05, + "loss": 0.1795, + "step": 1175 + }, + { + "epoch": 0.018816, + "grad_norm": 1.21875, + "learning_rate": 9.891129032258065e-05, + "loss": 0.2008, + "step": 1176 + }, + { + "epoch": 0.018832, + "grad_norm": 1.1171875, + "learning_rate": 9.890967741935483e-05, + "loss": 0.1956, + "step": 1177 + }, + { + "epoch": 0.018848, + "grad_norm": 1.265625, + "learning_rate": 9.890806451612903e-05, + "loss": 0.1974, + "step": 1178 + }, + { + "epoch": 0.018864, + "grad_norm": 0.9921875, + "learning_rate": 9.890645161290323e-05, + "loss": 0.2266, + "step": 1179 + }, + { + "epoch": 0.01888, + "grad_norm": 1.5625, + "learning_rate": 9.890483870967743e-05, + "loss": 0.2059, + "step": 1180 + }, + { + "epoch": 0.018896, + "grad_norm": 1.453125, + "learning_rate": 9.890322580645162e-05, + "loss": 0.2293, + "step": 1181 + }, + { + "epoch": 0.018912, + "grad_norm": 1.1875, + "learning_rate": 9.890161290322582e-05, + "loss": 0.2416, + "step": 1182 + }, + { + "epoch": 0.018928, + "grad_norm": 1.71875, + "learning_rate": 9.89e-05, + "loss": 0.2368, + "step": 1183 + }, + { + "epoch": 0.018944, + "grad_norm": 0.8515625, + "learning_rate": 9.88983870967742e-05, + "loss": 0.1421, + "step": 1184 + }, + { + "epoch": 0.01896, + "grad_norm": 1.625, + "learning_rate": 9.889677419354839e-05, + "loss": 0.2286, + "step": 1185 + }, + { + "epoch": 0.018976, + "grad_norm": 0.9453125, + "learning_rate": 9.889516129032258e-05, + "loss": 0.239, + "step": 1186 + }, + { + "epoch": 0.018992, + "grad_norm": 1.0625, + "learning_rate": 9.889354838709678e-05, + "loss": 0.1986, + "step": 1187 + }, + { + "epoch": 0.019008, + "grad_norm": 0.9140625, + "learning_rate": 9.889193548387096e-05, + "loss": 0.1791, + "step": 1188 + }, + { + "epoch": 0.019024, + "grad_norm": 1.6953125, + "learning_rate": 9.889032258064516e-05, + "loss": 0.2323, + "step": 1189 + }, + { + "epoch": 0.01904, + "grad_norm": 1.03125, + "learning_rate": 9.888870967741936e-05, + "loss": 0.2232, + "step": 1190 + }, + { + "epoch": 0.019056, + "grad_norm": 1.3828125, + "learning_rate": 9.888709677419356e-05, + "loss": 0.198, + "step": 1191 + }, + { + "epoch": 0.019072, + "grad_norm": 1.234375, + "learning_rate": 9.888548387096775e-05, + "loss": 0.2354, + "step": 1192 + }, + { + "epoch": 0.019088, + "grad_norm": 1.4921875, + "learning_rate": 9.888387096774195e-05, + "loss": 0.1872, + "step": 1193 + }, + { + "epoch": 0.019104, + "grad_norm": 1.0546875, + "learning_rate": 9.888225806451613e-05, + "loss": 0.1889, + "step": 1194 + }, + { + "epoch": 0.01912, + "grad_norm": 1.328125, + "learning_rate": 9.888064516129033e-05, + "loss": 0.1736, + "step": 1195 + }, + { + "epoch": 0.019136, + "grad_norm": 2.15625, + "learning_rate": 9.887903225806452e-05, + "loss": 0.2175, + "step": 1196 + }, + { + "epoch": 0.019152, + "grad_norm": 1.0703125, + "learning_rate": 9.887741935483872e-05, + "loss": 0.2099, + "step": 1197 + }, + { + "epoch": 0.019168, + "grad_norm": 1.7578125, + "learning_rate": 9.88758064516129e-05, + "loss": 0.2211, + "step": 1198 + }, + { + "epoch": 0.019184, + "grad_norm": 0.70703125, + "learning_rate": 9.88741935483871e-05, + "loss": 0.1707, + "step": 1199 + }, + { + "epoch": 0.0192, + "grad_norm": 1.0234375, + "learning_rate": 9.887258064516129e-05, + "loss": 0.1989, + "step": 1200 + }, + { + "epoch": 0.019216, + "grad_norm": 1.03125, + "learning_rate": 9.887096774193549e-05, + "loss": 0.1845, + "step": 1201 + }, + { + "epoch": 0.019232, + "grad_norm": 0.9921875, + "learning_rate": 9.886935483870968e-05, + "loss": 0.2616, + "step": 1202 + }, + { + "epoch": 0.019248, + "grad_norm": 1.875, + "learning_rate": 9.886774193548388e-05, + "loss": 0.2296, + "step": 1203 + }, + { + "epoch": 0.019264, + "grad_norm": 1.6953125, + "learning_rate": 9.886612903225808e-05, + "loss": 0.2217, + "step": 1204 + }, + { + "epoch": 0.01928, + "grad_norm": 1.7734375, + "learning_rate": 9.886451612903226e-05, + "loss": 0.2633, + "step": 1205 + }, + { + "epoch": 0.019296, + "grad_norm": 1.265625, + "learning_rate": 9.886290322580646e-05, + "loss": 0.194, + "step": 1206 + }, + { + "epoch": 0.019312, + "grad_norm": 1.0546875, + "learning_rate": 9.886129032258065e-05, + "loss": 0.1932, + "step": 1207 + }, + { + "epoch": 0.019328, + "grad_norm": 1.140625, + "learning_rate": 9.885967741935485e-05, + "loss": 0.1534, + "step": 1208 + }, + { + "epoch": 0.019344, + "grad_norm": 0.765625, + "learning_rate": 9.885806451612903e-05, + "loss": 0.1715, + "step": 1209 + }, + { + "epoch": 0.01936, + "grad_norm": 1.328125, + "learning_rate": 9.885645161290323e-05, + "loss": 0.1853, + "step": 1210 + }, + { + "epoch": 0.019376, + "grad_norm": 10.875, + "learning_rate": 9.885483870967742e-05, + "loss": 0.2495, + "step": 1211 + }, + { + "epoch": 0.019392, + "grad_norm": 2.46875, + "learning_rate": 9.885322580645162e-05, + "loss": 0.3524, + "step": 1212 + }, + { + "epoch": 0.019408, + "grad_norm": 2.09375, + "learning_rate": 9.88516129032258e-05, + "loss": 0.2729, + "step": 1213 + }, + { + "epoch": 0.019424, + "grad_norm": 3.671875, + "learning_rate": 9.885e-05, + "loss": 0.3281, + "step": 1214 + }, + { + "epoch": 0.01944, + "grad_norm": 2.25, + "learning_rate": 9.88483870967742e-05, + "loss": 0.2858, + "step": 1215 + }, + { + "epoch": 0.019456, + "grad_norm": 1.3359375, + "learning_rate": 9.88467741935484e-05, + "loss": 0.218, + "step": 1216 + }, + { + "epoch": 0.019472, + "grad_norm": 1.09375, + "learning_rate": 9.884516129032259e-05, + "loss": 0.2035, + "step": 1217 + }, + { + "epoch": 0.019488, + "grad_norm": 1.375, + "learning_rate": 9.884354838709678e-05, + "loss": 0.2551, + "step": 1218 + }, + { + "epoch": 0.019504, + "grad_norm": 1.609375, + "learning_rate": 9.884193548387097e-05, + "loss": 0.2445, + "step": 1219 + }, + { + "epoch": 0.01952, + "grad_norm": 1.1015625, + "learning_rate": 9.884032258064516e-05, + "loss": 0.2545, + "step": 1220 + }, + { + "epoch": 0.019536, + "grad_norm": 1.6875, + "learning_rate": 9.883870967741936e-05, + "loss": 0.2329, + "step": 1221 + }, + { + "epoch": 0.019552, + "grad_norm": 1.1484375, + "learning_rate": 9.883709677419355e-05, + "loss": 0.2113, + "step": 1222 + }, + { + "epoch": 0.019568, + "grad_norm": 1.3125, + "learning_rate": 9.883548387096775e-05, + "loss": 0.2868, + "step": 1223 + }, + { + "epoch": 0.019584, + "grad_norm": 1.2890625, + "learning_rate": 9.883387096774193e-05, + "loss": 0.2239, + "step": 1224 + }, + { + "epoch": 0.0196, + "grad_norm": 0.8828125, + "learning_rate": 9.883225806451613e-05, + "loss": 0.1683, + "step": 1225 + }, + { + "epoch": 0.019616, + "grad_norm": 1.15625, + "learning_rate": 9.883064516129033e-05, + "loss": 0.2237, + "step": 1226 + }, + { + "epoch": 0.019632, + "grad_norm": 9.1875, + "learning_rate": 9.882903225806452e-05, + "loss": 0.2141, + "step": 1227 + }, + { + "epoch": 0.019648, + "grad_norm": 1.8359375, + "learning_rate": 9.882741935483872e-05, + "loss": 0.2238, + "step": 1228 + }, + { + "epoch": 0.019664, + "grad_norm": 1.5234375, + "learning_rate": 9.882580645161292e-05, + "loss": 0.2205, + "step": 1229 + }, + { + "epoch": 0.01968, + "grad_norm": 0.84375, + "learning_rate": 9.88241935483871e-05, + "loss": 0.2032, + "step": 1230 + }, + { + "epoch": 0.019696, + "grad_norm": 1.53125, + "learning_rate": 9.88225806451613e-05, + "loss": 0.2567, + "step": 1231 + }, + { + "epoch": 0.019712, + "grad_norm": 1.53125, + "learning_rate": 9.882096774193549e-05, + "loss": 0.2728, + "step": 1232 + }, + { + "epoch": 0.019728, + "grad_norm": 1.2890625, + "learning_rate": 9.881935483870967e-05, + "loss": 0.2114, + "step": 1233 + }, + { + "epoch": 0.019744, + "grad_norm": 1.015625, + "learning_rate": 9.881774193548387e-05, + "loss": 0.2291, + "step": 1234 + }, + { + "epoch": 0.01976, + "grad_norm": 0.828125, + "learning_rate": 9.881612903225806e-05, + "loss": 0.2492, + "step": 1235 + }, + { + "epoch": 0.019776, + "grad_norm": 1.15625, + "learning_rate": 9.881451612903226e-05, + "loss": 0.2151, + "step": 1236 + }, + { + "epoch": 0.019792, + "grad_norm": 2.140625, + "learning_rate": 9.881290322580645e-05, + "loss": 0.2046, + "step": 1237 + }, + { + "epoch": 0.019808, + "grad_norm": 1.25, + "learning_rate": 9.881129032258065e-05, + "loss": 0.246, + "step": 1238 + }, + { + "epoch": 0.019824, + "grad_norm": 1.1796875, + "learning_rate": 9.880967741935485e-05, + "loss": 0.1915, + "step": 1239 + }, + { + "epoch": 0.01984, + "grad_norm": 1.140625, + "learning_rate": 9.880806451612905e-05, + "loss": 0.208, + "step": 1240 + }, + { + "epoch": 0.019856, + "grad_norm": 1.078125, + "learning_rate": 9.880645161290323e-05, + "loss": 0.2242, + "step": 1241 + }, + { + "epoch": 0.019872, + "grad_norm": 1.109375, + "learning_rate": 9.880483870967743e-05, + "loss": 0.2281, + "step": 1242 + }, + { + "epoch": 0.019888, + "grad_norm": 2.3125, + "learning_rate": 9.880322580645162e-05, + "loss": 0.2424, + "step": 1243 + }, + { + "epoch": 0.019904, + "grad_norm": 1.34375, + "learning_rate": 9.880161290322582e-05, + "loss": 0.2099, + "step": 1244 + }, + { + "epoch": 0.01992, + "grad_norm": 0.8046875, + "learning_rate": 9.88e-05, + "loss": 0.195, + "step": 1245 + }, + { + "epoch": 0.019936, + "grad_norm": 0.8515625, + "learning_rate": 9.87983870967742e-05, + "loss": 0.1791, + "step": 1246 + }, + { + "epoch": 0.019952, + "grad_norm": 0.9453125, + "learning_rate": 9.879677419354839e-05, + "loss": 0.2025, + "step": 1247 + }, + { + "epoch": 0.019968, + "grad_norm": 1.1875, + "learning_rate": 9.879516129032257e-05, + "loss": 0.229, + "step": 1248 + }, + { + "epoch": 0.019984, + "grad_norm": 1.171875, + "learning_rate": 9.879354838709677e-05, + "loss": 0.2816, + "step": 1249 + }, + { + "epoch": 0.02, + "grad_norm": 1.3671875, + "learning_rate": 9.879193548387097e-05, + "loss": 0.2522, + "step": 1250 + }, + { + "epoch": 0.020016, + "grad_norm": 1.4921875, + "learning_rate": 9.879032258064517e-05, + "loss": 0.1856, + "step": 1251 + }, + { + "epoch": 0.020032, + "grad_norm": 1.1171875, + "learning_rate": 9.878870967741936e-05, + "loss": 0.1926, + "step": 1252 + }, + { + "epoch": 0.020048, + "grad_norm": 0.9453125, + "learning_rate": 9.878709677419356e-05, + "loss": 0.1858, + "step": 1253 + }, + { + "epoch": 0.020064, + "grad_norm": 1.0859375, + "learning_rate": 9.878548387096775e-05, + "loss": 0.2152, + "step": 1254 + }, + { + "epoch": 0.02008, + "grad_norm": 1.5703125, + "learning_rate": 9.878387096774194e-05, + "loss": 0.2127, + "step": 1255 + }, + { + "epoch": 0.020096, + "grad_norm": 0.9921875, + "learning_rate": 9.878225806451613e-05, + "loss": 0.2178, + "step": 1256 + }, + { + "epoch": 0.020112, + "grad_norm": 1.1328125, + "learning_rate": 9.878064516129033e-05, + "loss": 0.2336, + "step": 1257 + }, + { + "epoch": 0.020128, + "grad_norm": 1.046875, + "learning_rate": 9.877903225806452e-05, + "loss": 0.1861, + "step": 1258 + }, + { + "epoch": 0.020144, + "grad_norm": 1.5625, + "learning_rate": 9.877741935483872e-05, + "loss": 0.2286, + "step": 1259 + }, + { + "epoch": 0.02016, + "grad_norm": 1.203125, + "learning_rate": 9.87758064516129e-05, + "loss": 0.194, + "step": 1260 + }, + { + "epoch": 0.020176, + "grad_norm": 1.09375, + "learning_rate": 9.87741935483871e-05, + "loss": 0.1988, + "step": 1261 + }, + { + "epoch": 0.020192, + "grad_norm": 1.2265625, + "learning_rate": 9.877258064516129e-05, + "loss": 0.2325, + "step": 1262 + }, + { + "epoch": 0.020208, + "grad_norm": 1.359375, + "learning_rate": 9.877096774193549e-05, + "loss": 0.2329, + "step": 1263 + }, + { + "epoch": 0.020224, + "grad_norm": 1.296875, + "learning_rate": 9.876935483870969e-05, + "loss": 0.26, + "step": 1264 + }, + { + "epoch": 0.02024, + "grad_norm": 1.03125, + "learning_rate": 9.876774193548387e-05, + "loss": 0.2088, + "step": 1265 + }, + { + "epoch": 0.020256, + "grad_norm": 1.2109375, + "learning_rate": 9.876612903225807e-05, + "loss": 0.2417, + "step": 1266 + }, + { + "epoch": 0.020272, + "grad_norm": 1.5390625, + "learning_rate": 9.876451612903226e-05, + "loss": 0.1728, + "step": 1267 + }, + { + "epoch": 0.020288, + "grad_norm": 1.4296875, + "learning_rate": 9.876290322580646e-05, + "loss": 0.2169, + "step": 1268 + }, + { + "epoch": 0.020304, + "grad_norm": 2.4375, + "learning_rate": 9.876129032258064e-05, + "loss": 0.2079, + "step": 1269 + }, + { + "epoch": 0.02032, + "grad_norm": 1.15625, + "learning_rate": 9.875967741935484e-05, + "loss": 0.2018, + "step": 1270 + }, + { + "epoch": 0.020336, + "grad_norm": 2.203125, + "learning_rate": 9.875806451612903e-05, + "loss": 0.2072, + "step": 1271 + }, + { + "epoch": 0.020352, + "grad_norm": 1.578125, + "learning_rate": 9.875645161290323e-05, + "loss": 0.2467, + "step": 1272 + }, + { + "epoch": 0.020368, + "grad_norm": 1.0625, + "learning_rate": 9.875483870967742e-05, + "loss": 0.188, + "step": 1273 + }, + { + "epoch": 0.020384, + "grad_norm": 2.875, + "learning_rate": 9.875322580645162e-05, + "loss": 0.2778, + "step": 1274 + }, + { + "epoch": 0.0204, + "grad_norm": 1.6796875, + "learning_rate": 9.875161290322582e-05, + "loss": 0.208, + "step": 1275 + }, + { + "epoch": 0.020416, + "grad_norm": 2.0, + "learning_rate": 9.875000000000002e-05, + "loss": 0.2241, + "step": 1276 + }, + { + "epoch": 0.020432, + "grad_norm": 0.734375, + "learning_rate": 9.87483870967742e-05, + "loss": 0.1965, + "step": 1277 + }, + { + "epoch": 0.020448, + "grad_norm": 1.8515625, + "learning_rate": 9.87467741935484e-05, + "loss": 0.2645, + "step": 1278 + }, + { + "epoch": 0.020464, + "grad_norm": 0.828125, + "learning_rate": 9.874516129032259e-05, + "loss": 0.1568, + "step": 1279 + }, + { + "epoch": 0.02048, + "grad_norm": 0.9921875, + "learning_rate": 9.874354838709677e-05, + "loss": 0.2537, + "step": 1280 + }, + { + "epoch": 0.020496, + "grad_norm": 1.1875, + "learning_rate": 9.874193548387097e-05, + "loss": 0.2086, + "step": 1281 + }, + { + "epoch": 0.020512, + "grad_norm": 1.203125, + "learning_rate": 9.874032258064516e-05, + "loss": 0.2338, + "step": 1282 + }, + { + "epoch": 0.020528, + "grad_norm": 2.203125, + "learning_rate": 9.873870967741936e-05, + "loss": 0.2799, + "step": 1283 + }, + { + "epoch": 0.020544, + "grad_norm": 0.859375, + "learning_rate": 9.873709677419354e-05, + "loss": 0.1875, + "step": 1284 + }, + { + "epoch": 0.02056, + "grad_norm": 1.2265625, + "learning_rate": 9.873548387096774e-05, + "loss": 0.2126, + "step": 1285 + }, + { + "epoch": 0.020576, + "grad_norm": 0.91796875, + "learning_rate": 9.873387096774194e-05, + "loss": 0.1947, + "step": 1286 + }, + { + "epoch": 0.020592, + "grad_norm": 0.96484375, + "learning_rate": 9.873225806451614e-05, + "loss": 0.2274, + "step": 1287 + }, + { + "epoch": 0.020608, + "grad_norm": 1.078125, + "learning_rate": 9.873064516129033e-05, + "loss": 0.1968, + "step": 1288 + }, + { + "epoch": 0.020624, + "grad_norm": 0.69921875, + "learning_rate": 9.872903225806453e-05, + "loss": 0.1658, + "step": 1289 + }, + { + "epoch": 0.02064, + "grad_norm": 1.1484375, + "learning_rate": 9.872741935483872e-05, + "loss": 0.1785, + "step": 1290 + }, + { + "epoch": 0.020656, + "grad_norm": 1.0234375, + "learning_rate": 9.872580645161291e-05, + "loss": 0.2014, + "step": 1291 + }, + { + "epoch": 0.020672, + "grad_norm": 1.265625, + "learning_rate": 9.87241935483871e-05, + "loss": 0.2351, + "step": 1292 + }, + { + "epoch": 0.020688, + "grad_norm": 1.046875, + "learning_rate": 9.87225806451613e-05, + "loss": 0.1873, + "step": 1293 + }, + { + "epoch": 0.020704, + "grad_norm": 1.1328125, + "learning_rate": 9.872096774193549e-05, + "loss": 0.1996, + "step": 1294 + }, + { + "epoch": 0.02072, + "grad_norm": 1.8203125, + "learning_rate": 9.871935483870967e-05, + "loss": 0.2115, + "step": 1295 + }, + { + "epoch": 0.020736, + "grad_norm": 0.81640625, + "learning_rate": 9.871774193548387e-05, + "loss": 0.1733, + "step": 1296 + }, + { + "epoch": 0.020752, + "grad_norm": 1.1171875, + "learning_rate": 9.871612903225806e-05, + "loss": 0.2223, + "step": 1297 + }, + { + "epoch": 0.020768, + "grad_norm": 1.5, + "learning_rate": 9.871451612903226e-05, + "loss": 0.2067, + "step": 1298 + }, + { + "epoch": 0.020784, + "grad_norm": 1.6484375, + "learning_rate": 9.871290322580646e-05, + "loss": 0.2126, + "step": 1299 + }, + { + "epoch": 0.0208, + "grad_norm": 1.2109375, + "learning_rate": 9.871129032258066e-05, + "loss": 0.2219, + "step": 1300 + }, + { + "epoch": 0.020816, + "grad_norm": 1.109375, + "learning_rate": 9.870967741935484e-05, + "loss": 0.2112, + "step": 1301 + }, + { + "epoch": 0.020832, + "grad_norm": 1.390625, + "learning_rate": 9.870806451612904e-05, + "loss": 0.2363, + "step": 1302 + }, + { + "epoch": 0.020848, + "grad_norm": 1.0859375, + "learning_rate": 9.870645161290323e-05, + "loss": 0.2462, + "step": 1303 + }, + { + "epoch": 0.020864, + "grad_norm": 0.890625, + "learning_rate": 9.870483870967743e-05, + "loss": 0.18, + "step": 1304 + }, + { + "epoch": 0.02088, + "grad_norm": 0.96875, + "learning_rate": 9.870322580645161e-05, + "loss": 0.2025, + "step": 1305 + }, + { + "epoch": 0.020896, + "grad_norm": 0.78125, + "learning_rate": 9.870161290322581e-05, + "loss": 0.2045, + "step": 1306 + }, + { + "epoch": 0.020912, + "grad_norm": 1.0390625, + "learning_rate": 9.87e-05, + "loss": 0.1742, + "step": 1307 + }, + { + "epoch": 0.020928, + "grad_norm": 0.90625, + "learning_rate": 9.86983870967742e-05, + "loss": 0.1951, + "step": 1308 + }, + { + "epoch": 0.020944, + "grad_norm": 1.0546875, + "learning_rate": 9.869677419354839e-05, + "loss": 0.2063, + "step": 1309 + }, + { + "epoch": 0.02096, + "grad_norm": 0.95703125, + "learning_rate": 9.869516129032259e-05, + "loss": 0.1812, + "step": 1310 + }, + { + "epoch": 0.020976, + "grad_norm": 0.93359375, + "learning_rate": 9.869354838709679e-05, + "loss": 0.1968, + "step": 1311 + }, + { + "epoch": 0.020992, + "grad_norm": 0.88671875, + "learning_rate": 9.869193548387097e-05, + "loss": 0.1855, + "step": 1312 + }, + { + "epoch": 0.021008, + "grad_norm": 1.1484375, + "learning_rate": 9.869032258064517e-05, + "loss": 0.2178, + "step": 1313 + }, + { + "epoch": 0.021024, + "grad_norm": 1.0546875, + "learning_rate": 9.868870967741936e-05, + "loss": 0.275, + "step": 1314 + }, + { + "epoch": 0.02104, + "grad_norm": 1.2265625, + "learning_rate": 9.868709677419356e-05, + "loss": 0.2058, + "step": 1315 + }, + { + "epoch": 0.021056, + "grad_norm": 1.0703125, + "learning_rate": 9.868548387096774e-05, + "loss": 0.1859, + "step": 1316 + }, + { + "epoch": 0.021072, + "grad_norm": 1.3359375, + "learning_rate": 9.868387096774194e-05, + "loss": 0.2666, + "step": 1317 + }, + { + "epoch": 0.021088, + "grad_norm": 1.1015625, + "learning_rate": 9.868225806451613e-05, + "loss": 0.2267, + "step": 1318 + }, + { + "epoch": 0.021104, + "grad_norm": 0.8125, + "learning_rate": 9.868064516129033e-05, + "loss": 0.2132, + "step": 1319 + }, + { + "epoch": 0.02112, + "grad_norm": 1.1953125, + "learning_rate": 9.867903225806451e-05, + "loss": 0.2255, + "step": 1320 + }, + { + "epoch": 0.021136, + "grad_norm": 0.71875, + "learning_rate": 9.867741935483871e-05, + "loss": 0.1854, + "step": 1321 + }, + { + "epoch": 0.021152, + "grad_norm": 1.0546875, + "learning_rate": 9.86758064516129e-05, + "loss": 0.1962, + "step": 1322 + }, + { + "epoch": 0.021168, + "grad_norm": 0.94140625, + "learning_rate": 9.86741935483871e-05, + "loss": 0.2275, + "step": 1323 + }, + { + "epoch": 0.021184, + "grad_norm": 1.0078125, + "learning_rate": 9.86725806451613e-05, + "loss": 0.2414, + "step": 1324 + }, + { + "epoch": 0.0212, + "grad_norm": 1.2109375, + "learning_rate": 9.86709677419355e-05, + "loss": 0.1907, + "step": 1325 + }, + { + "epoch": 0.021216, + "grad_norm": 1.046875, + "learning_rate": 9.866935483870968e-05, + "loss": 0.2165, + "step": 1326 + }, + { + "epoch": 0.021232, + "grad_norm": 2.59375, + "learning_rate": 9.866774193548387e-05, + "loss": 0.2103, + "step": 1327 + }, + { + "epoch": 0.021248, + "grad_norm": 1.1015625, + "learning_rate": 9.866612903225807e-05, + "loss": 0.2099, + "step": 1328 + }, + { + "epoch": 0.021264, + "grad_norm": 1.140625, + "learning_rate": 9.866451612903226e-05, + "loss": 0.2203, + "step": 1329 + }, + { + "epoch": 0.02128, + "grad_norm": 1.546875, + "learning_rate": 9.866290322580646e-05, + "loss": 0.2416, + "step": 1330 + }, + { + "epoch": 0.021296, + "grad_norm": 0.9765625, + "learning_rate": 9.866129032258064e-05, + "loss": 0.174, + "step": 1331 + }, + { + "epoch": 0.021312, + "grad_norm": 0.96875, + "learning_rate": 9.865967741935484e-05, + "loss": 0.2119, + "step": 1332 + }, + { + "epoch": 0.021328, + "grad_norm": 0.94140625, + "learning_rate": 9.865806451612903e-05, + "loss": 0.1738, + "step": 1333 + }, + { + "epoch": 0.021344, + "grad_norm": 1.0703125, + "learning_rate": 9.865645161290323e-05, + "loss": 0.1781, + "step": 1334 + }, + { + "epoch": 0.02136, + "grad_norm": 1.21875, + "learning_rate": 9.865483870967743e-05, + "loss": 0.277, + "step": 1335 + }, + { + "epoch": 0.021376, + "grad_norm": 1.484375, + "learning_rate": 9.865322580645163e-05, + "loss": 0.1941, + "step": 1336 + }, + { + "epoch": 0.021392, + "grad_norm": 1.0234375, + "learning_rate": 9.865161290322581e-05, + "loss": 0.2036, + "step": 1337 + }, + { + "epoch": 0.021408, + "grad_norm": 1.59375, + "learning_rate": 9.865000000000001e-05, + "loss": 0.2316, + "step": 1338 + }, + { + "epoch": 0.021424, + "grad_norm": 1.4609375, + "learning_rate": 9.86483870967742e-05, + "loss": 0.2361, + "step": 1339 + }, + { + "epoch": 0.02144, + "grad_norm": 1.171875, + "learning_rate": 9.86467741935484e-05, + "loss": 0.1855, + "step": 1340 + }, + { + "epoch": 0.021456, + "grad_norm": 1.0546875, + "learning_rate": 9.864516129032258e-05, + "loss": 0.2048, + "step": 1341 + }, + { + "epoch": 0.021472, + "grad_norm": 1.546875, + "learning_rate": 9.864354838709677e-05, + "loss": 0.2819, + "step": 1342 + }, + { + "epoch": 0.021488, + "grad_norm": 1.046875, + "learning_rate": 9.864193548387097e-05, + "loss": 0.2102, + "step": 1343 + }, + { + "epoch": 0.021504, + "grad_norm": 1.5859375, + "learning_rate": 9.864032258064516e-05, + "loss": 0.2203, + "step": 1344 + }, + { + "epoch": 0.02152, + "grad_norm": 0.8984375, + "learning_rate": 9.863870967741936e-05, + "loss": 0.1989, + "step": 1345 + }, + { + "epoch": 0.021536, + "grad_norm": 1.1484375, + "learning_rate": 9.863709677419356e-05, + "loss": 0.2025, + "step": 1346 + }, + { + "epoch": 0.021552, + "grad_norm": 1.0625, + "learning_rate": 9.863548387096776e-05, + "loss": 0.2192, + "step": 1347 + }, + { + "epoch": 0.021568, + "grad_norm": 1.03125, + "learning_rate": 9.863387096774194e-05, + "loss": 0.1933, + "step": 1348 + }, + { + "epoch": 0.021584, + "grad_norm": 1.234375, + "learning_rate": 9.863225806451614e-05, + "loss": 0.1964, + "step": 1349 + }, + { + "epoch": 0.0216, + "grad_norm": 1.390625, + "learning_rate": 9.863064516129033e-05, + "loss": 0.2593, + "step": 1350 + }, + { + "epoch": 0.021616, + "grad_norm": 0.97265625, + "learning_rate": 9.862903225806453e-05, + "loss": 0.2505, + "step": 1351 + }, + { + "epoch": 0.021632, + "grad_norm": 0.95703125, + "learning_rate": 9.862741935483871e-05, + "loss": 0.2206, + "step": 1352 + }, + { + "epoch": 0.021648, + "grad_norm": 1.1484375, + "learning_rate": 9.862580645161291e-05, + "loss": 0.2178, + "step": 1353 + }, + { + "epoch": 0.021664, + "grad_norm": 0.86328125, + "learning_rate": 9.86241935483871e-05, + "loss": 0.2162, + "step": 1354 + }, + { + "epoch": 0.02168, + "grad_norm": 1.0625, + "learning_rate": 9.86225806451613e-05, + "loss": 0.2629, + "step": 1355 + }, + { + "epoch": 0.021696, + "grad_norm": 1.1875, + "learning_rate": 9.862096774193548e-05, + "loss": 0.229, + "step": 1356 + }, + { + "epoch": 0.021712, + "grad_norm": 0.9296875, + "learning_rate": 9.861935483870967e-05, + "loss": 0.2015, + "step": 1357 + }, + { + "epoch": 0.021728, + "grad_norm": 1.0703125, + "learning_rate": 9.861774193548387e-05, + "loss": 0.1739, + "step": 1358 + }, + { + "epoch": 0.021744, + "grad_norm": 1.1015625, + "learning_rate": 9.861612903225807e-05, + "loss": 0.2015, + "step": 1359 + }, + { + "epoch": 0.02176, + "grad_norm": 0.88671875, + "learning_rate": 9.861451612903227e-05, + "loss": 0.2102, + "step": 1360 + }, + { + "epoch": 0.021776, + "grad_norm": 1.109375, + "learning_rate": 9.861290322580646e-05, + "loss": 0.2698, + "step": 1361 + }, + { + "epoch": 0.021792, + "grad_norm": 0.8125, + "learning_rate": 9.861129032258065e-05, + "loss": 0.1994, + "step": 1362 + }, + { + "epoch": 0.021808, + "grad_norm": 1.234375, + "learning_rate": 9.860967741935484e-05, + "loss": 0.1495, + "step": 1363 + }, + { + "epoch": 0.021824, + "grad_norm": 1.0859375, + "learning_rate": 9.860806451612904e-05, + "loss": 0.1874, + "step": 1364 + }, + { + "epoch": 0.02184, + "grad_norm": 1.4140625, + "learning_rate": 9.860645161290323e-05, + "loss": 0.2492, + "step": 1365 + }, + { + "epoch": 0.021856, + "grad_norm": 1.0234375, + "learning_rate": 9.860483870967743e-05, + "loss": 0.2164, + "step": 1366 + }, + { + "epoch": 0.021872, + "grad_norm": 0.984375, + "learning_rate": 9.860322580645161e-05, + "loss": 0.1751, + "step": 1367 + }, + { + "epoch": 0.021888, + "grad_norm": 1.0234375, + "learning_rate": 9.860161290322581e-05, + "loss": 0.1855, + "step": 1368 + }, + { + "epoch": 0.021904, + "grad_norm": 0.984375, + "learning_rate": 9.86e-05, + "loss": 0.1613, + "step": 1369 + }, + { + "epoch": 0.02192, + "grad_norm": 0.80859375, + "learning_rate": 9.85983870967742e-05, + "loss": 0.187, + "step": 1370 + }, + { + "epoch": 0.021936, + "grad_norm": 1.359375, + "learning_rate": 9.85967741935484e-05, + "loss": 0.2754, + "step": 1371 + }, + { + "epoch": 0.021952, + "grad_norm": 1.1640625, + "learning_rate": 9.85951612903226e-05, + "loss": 0.1967, + "step": 1372 + }, + { + "epoch": 0.021968, + "grad_norm": 0.98046875, + "learning_rate": 9.859354838709678e-05, + "loss": 0.1903, + "step": 1373 + }, + { + "epoch": 0.021984, + "grad_norm": 0.97265625, + "learning_rate": 9.859193548387097e-05, + "loss": 0.1872, + "step": 1374 + }, + { + "epoch": 0.022, + "grad_norm": 1.1484375, + "learning_rate": 9.859032258064517e-05, + "loss": 0.2426, + "step": 1375 + }, + { + "epoch": 0.022016, + "grad_norm": 1.1640625, + "learning_rate": 9.858870967741935e-05, + "loss": 0.1942, + "step": 1376 + }, + { + "epoch": 0.022032, + "grad_norm": 0.8359375, + "learning_rate": 9.858709677419355e-05, + "loss": 0.2074, + "step": 1377 + }, + { + "epoch": 0.022048, + "grad_norm": 1.1953125, + "learning_rate": 9.858548387096774e-05, + "loss": 0.2281, + "step": 1378 + }, + { + "epoch": 0.022064, + "grad_norm": 1.140625, + "learning_rate": 9.858387096774194e-05, + "loss": 0.1981, + "step": 1379 + }, + { + "epoch": 0.02208, + "grad_norm": 1.1328125, + "learning_rate": 9.858225806451613e-05, + "loss": 0.2193, + "step": 1380 + }, + { + "epoch": 0.022096, + "grad_norm": 1.9296875, + "learning_rate": 9.858064516129033e-05, + "loss": 0.2399, + "step": 1381 + }, + { + "epoch": 0.022112, + "grad_norm": 1.09375, + "learning_rate": 9.857903225806453e-05, + "loss": 0.1736, + "step": 1382 + }, + { + "epoch": 0.022128, + "grad_norm": 1.5390625, + "learning_rate": 9.857741935483872e-05, + "loss": 0.2203, + "step": 1383 + }, + { + "epoch": 0.022144, + "grad_norm": 1.0390625, + "learning_rate": 9.857580645161291e-05, + "loss": 0.205, + "step": 1384 + }, + { + "epoch": 0.02216, + "grad_norm": 1.234375, + "learning_rate": 9.857419354838711e-05, + "loss": 0.1782, + "step": 1385 + }, + { + "epoch": 0.022176, + "grad_norm": 0.72265625, + "learning_rate": 9.85725806451613e-05, + "loss": 0.1651, + "step": 1386 + }, + { + "epoch": 0.022192, + "grad_norm": 1.40625, + "learning_rate": 9.85709677419355e-05, + "loss": 0.2106, + "step": 1387 + }, + { + "epoch": 0.022208, + "grad_norm": 1.3203125, + "learning_rate": 9.856935483870968e-05, + "loss": 0.1944, + "step": 1388 + }, + { + "epoch": 0.022224, + "grad_norm": 0.65234375, + "learning_rate": 9.856774193548387e-05, + "loss": 0.1721, + "step": 1389 + }, + { + "epoch": 0.02224, + "grad_norm": 0.82421875, + "learning_rate": 9.856612903225807e-05, + "loss": 0.2068, + "step": 1390 + }, + { + "epoch": 0.022256, + "grad_norm": 1.0625, + "learning_rate": 9.856451612903225e-05, + "loss": 0.1947, + "step": 1391 + }, + { + "epoch": 0.022272, + "grad_norm": 1.1875, + "learning_rate": 9.856290322580645e-05, + "loss": 0.212, + "step": 1392 + }, + { + "epoch": 0.022288, + "grad_norm": 1.03125, + "learning_rate": 9.856129032258064e-05, + "loss": 0.2038, + "step": 1393 + }, + { + "epoch": 0.022304, + "grad_norm": 1.1171875, + "learning_rate": 9.855967741935484e-05, + "loss": 0.213, + "step": 1394 + }, + { + "epoch": 0.02232, + "grad_norm": 1.234375, + "learning_rate": 9.855806451612904e-05, + "loss": 0.2634, + "step": 1395 + }, + { + "epoch": 0.022336, + "grad_norm": 1.1015625, + "learning_rate": 9.855645161290324e-05, + "loss": 0.2154, + "step": 1396 + }, + { + "epoch": 0.022352, + "grad_norm": 0.91796875, + "learning_rate": 9.855483870967742e-05, + "loss": 0.2054, + "step": 1397 + }, + { + "epoch": 0.022368, + "grad_norm": 1.21875, + "learning_rate": 9.855322580645162e-05, + "loss": 0.2183, + "step": 1398 + }, + { + "epoch": 0.022384, + "grad_norm": 0.796875, + "learning_rate": 9.855161290322581e-05, + "loss": 0.1865, + "step": 1399 + }, + { + "epoch": 0.0224, + "grad_norm": 0.8828125, + "learning_rate": 9.855000000000001e-05, + "loss": 0.1985, + "step": 1400 + }, + { + "epoch": 0.022416, + "grad_norm": 1.0625, + "learning_rate": 9.85483870967742e-05, + "loss": 0.1847, + "step": 1401 + }, + { + "epoch": 0.022432, + "grad_norm": 0.8515625, + "learning_rate": 9.85467741935484e-05, + "loss": 0.2083, + "step": 1402 + }, + { + "epoch": 0.022448, + "grad_norm": 1.1796875, + "learning_rate": 9.854516129032258e-05, + "loss": 0.1997, + "step": 1403 + }, + { + "epoch": 0.022464, + "grad_norm": 0.953125, + "learning_rate": 9.854354838709677e-05, + "loss": 0.2236, + "step": 1404 + }, + { + "epoch": 0.02248, + "grad_norm": 2.828125, + "learning_rate": 9.854193548387097e-05, + "loss": 0.1798, + "step": 1405 + }, + { + "epoch": 0.022496, + "grad_norm": 0.9296875, + "learning_rate": 9.854032258064517e-05, + "loss": 0.1333, + "step": 1406 + }, + { + "epoch": 0.022512, + "grad_norm": 1.328125, + "learning_rate": 9.853870967741937e-05, + "loss": 0.2366, + "step": 1407 + }, + { + "epoch": 0.022528, + "grad_norm": 1.15625, + "learning_rate": 9.853709677419355e-05, + "loss": 0.2134, + "step": 1408 + }, + { + "epoch": 0.022544, + "grad_norm": 1.5078125, + "learning_rate": 9.853548387096775e-05, + "loss": 0.2551, + "step": 1409 + }, + { + "epoch": 0.02256, + "grad_norm": 1.0390625, + "learning_rate": 9.853387096774194e-05, + "loss": 0.2118, + "step": 1410 + }, + { + "epoch": 0.022576, + "grad_norm": 1.640625, + "learning_rate": 9.853225806451614e-05, + "loss": 0.2274, + "step": 1411 + }, + { + "epoch": 0.022592, + "grad_norm": 0.78515625, + "learning_rate": 9.853064516129032e-05, + "loss": 0.1802, + "step": 1412 + }, + { + "epoch": 0.022608, + "grad_norm": 0.82421875, + "learning_rate": 9.852903225806452e-05, + "loss": 0.1987, + "step": 1413 + }, + { + "epoch": 0.022624, + "grad_norm": 1.09375, + "learning_rate": 9.852741935483871e-05, + "loss": 0.2385, + "step": 1414 + }, + { + "epoch": 0.02264, + "grad_norm": 0.92578125, + "learning_rate": 9.852580645161291e-05, + "loss": 0.2018, + "step": 1415 + }, + { + "epoch": 0.022656, + "grad_norm": 1.5, + "learning_rate": 9.85241935483871e-05, + "loss": 0.2589, + "step": 1416 + }, + { + "epoch": 0.022672, + "grad_norm": 0.89453125, + "learning_rate": 9.85225806451613e-05, + "loss": 0.2131, + "step": 1417 + }, + { + "epoch": 0.022688, + "grad_norm": 1.546875, + "learning_rate": 9.852096774193548e-05, + "loss": 0.2475, + "step": 1418 + }, + { + "epoch": 0.022704, + "grad_norm": 1.0625, + "learning_rate": 9.851935483870968e-05, + "loss": 0.2284, + "step": 1419 + }, + { + "epoch": 0.02272, + "grad_norm": 1.1171875, + "learning_rate": 9.851774193548388e-05, + "loss": 0.201, + "step": 1420 + }, + { + "epoch": 0.022736, + "grad_norm": 1.1328125, + "learning_rate": 9.851612903225807e-05, + "loss": 0.2425, + "step": 1421 + }, + { + "epoch": 0.022752, + "grad_norm": 0.96875, + "learning_rate": 9.851451612903227e-05, + "loss": 0.22, + "step": 1422 + }, + { + "epoch": 0.022768, + "grad_norm": 1.546875, + "learning_rate": 9.851290322580645e-05, + "loss": 0.2073, + "step": 1423 + }, + { + "epoch": 0.022784, + "grad_norm": 1.4296875, + "learning_rate": 9.851129032258065e-05, + "loss": 0.1931, + "step": 1424 + }, + { + "epoch": 0.0228, + "grad_norm": 1.3359375, + "learning_rate": 9.850967741935484e-05, + "loss": 0.2086, + "step": 1425 + }, + { + "epoch": 0.022816, + "grad_norm": 1.40625, + "learning_rate": 9.850806451612904e-05, + "loss": 0.211, + "step": 1426 + }, + { + "epoch": 0.022832, + "grad_norm": 0.90234375, + "learning_rate": 9.850645161290322e-05, + "loss": 0.1723, + "step": 1427 + }, + { + "epoch": 0.022848, + "grad_norm": 0.61328125, + "learning_rate": 9.850483870967742e-05, + "loss": 0.1533, + "step": 1428 + }, + { + "epoch": 0.022864, + "grad_norm": 1.1796875, + "learning_rate": 9.850322580645161e-05, + "loss": 0.1922, + "step": 1429 + }, + { + "epoch": 0.02288, + "grad_norm": 2.40625, + "learning_rate": 9.850161290322581e-05, + "loss": 0.2259, + "step": 1430 + }, + { + "epoch": 0.022896, + "grad_norm": 0.79296875, + "learning_rate": 9.850000000000001e-05, + "loss": 0.1602, + "step": 1431 + }, + { + "epoch": 0.022912, + "grad_norm": 1.1640625, + "learning_rate": 9.849838709677421e-05, + "loss": 0.2284, + "step": 1432 + }, + { + "epoch": 0.022928, + "grad_norm": 0.94921875, + "learning_rate": 9.84967741935484e-05, + "loss": 0.2414, + "step": 1433 + }, + { + "epoch": 0.022944, + "grad_norm": 0.9375, + "learning_rate": 9.84951612903226e-05, + "loss": 0.2138, + "step": 1434 + }, + { + "epoch": 0.02296, + "grad_norm": 1.34375, + "learning_rate": 9.849354838709678e-05, + "loss": 0.2305, + "step": 1435 + }, + { + "epoch": 0.022976, + "grad_norm": 1.1484375, + "learning_rate": 9.849193548387097e-05, + "loss": 0.1618, + "step": 1436 + }, + { + "epoch": 0.022992, + "grad_norm": 1.84375, + "learning_rate": 9.849032258064517e-05, + "loss": 0.2218, + "step": 1437 + }, + { + "epoch": 0.023008, + "grad_norm": 1.0, + "learning_rate": 9.848870967741935e-05, + "loss": 0.2086, + "step": 1438 + }, + { + "epoch": 0.023024, + "grad_norm": 1.234375, + "learning_rate": 9.848709677419355e-05, + "loss": 0.2001, + "step": 1439 + }, + { + "epoch": 0.02304, + "grad_norm": 1.4140625, + "learning_rate": 9.848548387096774e-05, + "loss": 0.2285, + "step": 1440 + }, + { + "epoch": 0.023056, + "grad_norm": 1.671875, + "learning_rate": 9.848387096774194e-05, + "loss": 0.2566, + "step": 1441 + }, + { + "epoch": 0.023072, + "grad_norm": 1.75, + "learning_rate": 9.848225806451614e-05, + "loss": 0.2286, + "step": 1442 + }, + { + "epoch": 0.023088, + "grad_norm": 2.453125, + "learning_rate": 9.848064516129034e-05, + "loss": 0.2157, + "step": 1443 + }, + { + "epoch": 0.023104, + "grad_norm": 0.96484375, + "learning_rate": 9.847903225806452e-05, + "loss": 0.1822, + "step": 1444 + }, + { + "epoch": 0.02312, + "grad_norm": 0.84765625, + "learning_rate": 9.847741935483872e-05, + "loss": 0.1956, + "step": 1445 + }, + { + "epoch": 0.023136, + "grad_norm": 1.1171875, + "learning_rate": 9.847580645161291e-05, + "loss": 0.1766, + "step": 1446 + }, + { + "epoch": 0.023152, + "grad_norm": 1.65625, + "learning_rate": 9.847419354838711e-05, + "loss": 0.1824, + "step": 1447 + }, + { + "epoch": 0.023168, + "grad_norm": 1.09375, + "learning_rate": 9.84725806451613e-05, + "loss": 0.2189, + "step": 1448 + }, + { + "epoch": 0.023184, + "grad_norm": 1.1796875, + "learning_rate": 9.84709677419355e-05, + "loss": 0.1939, + "step": 1449 + }, + { + "epoch": 0.0232, + "grad_norm": 1.3984375, + "learning_rate": 9.846935483870968e-05, + "loss": 0.1788, + "step": 1450 + }, + { + "epoch": 0.023216, + "grad_norm": 1.0, + "learning_rate": 9.846774193548387e-05, + "loss": 0.2174, + "step": 1451 + }, + { + "epoch": 0.023232, + "grad_norm": 1.0234375, + "learning_rate": 9.846612903225807e-05, + "loss": 0.209, + "step": 1452 + }, + { + "epoch": 0.023248, + "grad_norm": 0.87890625, + "learning_rate": 9.846451612903225e-05, + "loss": 0.1543, + "step": 1453 + }, + { + "epoch": 0.023264, + "grad_norm": 2.203125, + "learning_rate": 9.846290322580645e-05, + "loss": 0.2573, + "step": 1454 + }, + { + "epoch": 0.02328, + "grad_norm": 1.3125, + "learning_rate": 9.846129032258065e-05, + "loss": 0.1883, + "step": 1455 + }, + { + "epoch": 0.023296, + "grad_norm": 1.3828125, + "learning_rate": 9.845967741935485e-05, + "loss": 0.2028, + "step": 1456 + }, + { + "epoch": 0.023312, + "grad_norm": 1.1015625, + "learning_rate": 9.845806451612904e-05, + "loss": 0.1891, + "step": 1457 + }, + { + "epoch": 0.023328, + "grad_norm": 2.296875, + "learning_rate": 9.845645161290324e-05, + "loss": 0.2167, + "step": 1458 + }, + { + "epoch": 0.023344, + "grad_norm": 0.953125, + "learning_rate": 9.845483870967742e-05, + "loss": 0.2111, + "step": 1459 + }, + { + "epoch": 0.02336, + "grad_norm": 0.91015625, + "learning_rate": 9.845322580645162e-05, + "loss": 0.2093, + "step": 1460 + }, + { + "epoch": 0.023376, + "grad_norm": 1.03125, + "learning_rate": 9.845161290322581e-05, + "loss": 0.159, + "step": 1461 + }, + { + "epoch": 0.023392, + "grad_norm": 0.7578125, + "learning_rate": 9.845000000000001e-05, + "loss": 0.1799, + "step": 1462 + }, + { + "epoch": 0.023408, + "grad_norm": 1.0390625, + "learning_rate": 9.84483870967742e-05, + "loss": 0.2259, + "step": 1463 + }, + { + "epoch": 0.023424, + "grad_norm": 1.5390625, + "learning_rate": 9.84467741935484e-05, + "loss": 0.1856, + "step": 1464 + }, + { + "epoch": 0.02344, + "grad_norm": 0.99609375, + "learning_rate": 9.844516129032258e-05, + "loss": 0.1861, + "step": 1465 + }, + { + "epoch": 0.023456, + "grad_norm": 1.2265625, + "learning_rate": 9.844354838709678e-05, + "loss": 0.1733, + "step": 1466 + }, + { + "epoch": 0.023472, + "grad_norm": 1.7265625, + "learning_rate": 9.844193548387098e-05, + "loss": 0.2427, + "step": 1467 + }, + { + "epoch": 0.023488, + "grad_norm": 1.3359375, + "learning_rate": 9.844032258064516e-05, + "loss": 0.2074, + "step": 1468 + }, + { + "epoch": 0.023504, + "grad_norm": 0.71484375, + "learning_rate": 9.843870967741936e-05, + "loss": 0.1367, + "step": 1469 + }, + { + "epoch": 0.02352, + "grad_norm": 1.2578125, + "learning_rate": 9.843709677419355e-05, + "loss": 0.2064, + "step": 1470 + }, + { + "epoch": 0.023536, + "grad_norm": 1.09375, + "learning_rate": 9.843548387096775e-05, + "loss": 0.2231, + "step": 1471 + }, + { + "epoch": 0.023552, + "grad_norm": 1.6484375, + "learning_rate": 9.843387096774194e-05, + "loss": 0.1915, + "step": 1472 + }, + { + "epoch": 0.023568, + "grad_norm": 2.421875, + "learning_rate": 9.843225806451614e-05, + "loss": 0.2051, + "step": 1473 + }, + { + "epoch": 0.023584, + "grad_norm": 1.296875, + "learning_rate": 9.843064516129032e-05, + "loss": 0.2322, + "step": 1474 + }, + { + "epoch": 0.0236, + "grad_norm": 0.9453125, + "learning_rate": 9.842903225806452e-05, + "loss": 0.1961, + "step": 1475 + }, + { + "epoch": 0.023616, + "grad_norm": 1.21875, + "learning_rate": 9.842741935483871e-05, + "loss": 0.247, + "step": 1476 + }, + { + "epoch": 0.023632, + "grad_norm": 1.28125, + "learning_rate": 9.842580645161291e-05, + "loss": 0.1914, + "step": 1477 + }, + { + "epoch": 0.023648, + "grad_norm": 0.78515625, + "learning_rate": 9.842419354838711e-05, + "loss": 0.1545, + "step": 1478 + }, + { + "epoch": 0.023664, + "grad_norm": 1.375, + "learning_rate": 9.842258064516129e-05, + "loss": 0.18, + "step": 1479 + }, + { + "epoch": 0.02368, + "grad_norm": 1.3671875, + "learning_rate": 9.842096774193549e-05, + "loss": 0.1917, + "step": 1480 + }, + { + "epoch": 0.023696, + "grad_norm": 0.79296875, + "learning_rate": 9.841935483870969e-05, + "loss": 0.188, + "step": 1481 + }, + { + "epoch": 0.023712, + "grad_norm": 0.859375, + "learning_rate": 9.841774193548388e-05, + "loss": 0.2182, + "step": 1482 + }, + { + "epoch": 0.023728, + "grad_norm": 1.2578125, + "learning_rate": 9.841612903225806e-05, + "loss": 0.2059, + "step": 1483 + }, + { + "epoch": 0.023744, + "grad_norm": 1.078125, + "learning_rate": 9.841451612903226e-05, + "loss": 0.2171, + "step": 1484 + }, + { + "epoch": 0.02376, + "grad_norm": 1.4375, + "learning_rate": 9.841290322580645e-05, + "loss": 0.2821, + "step": 1485 + }, + { + "epoch": 0.023776, + "grad_norm": 0.9609375, + "learning_rate": 9.841129032258065e-05, + "loss": 0.1813, + "step": 1486 + }, + { + "epoch": 0.023792, + "grad_norm": 1.046875, + "learning_rate": 9.840967741935484e-05, + "loss": 0.2118, + "step": 1487 + }, + { + "epoch": 0.023808, + "grad_norm": 1.1796875, + "learning_rate": 9.840806451612904e-05, + "loss": 0.1805, + "step": 1488 + }, + { + "epoch": 0.023824, + "grad_norm": 1.6953125, + "learning_rate": 9.840645161290322e-05, + "loss": 0.1822, + "step": 1489 + }, + { + "epoch": 0.02384, + "grad_norm": 0.76171875, + "learning_rate": 9.840483870967742e-05, + "loss": 0.1432, + "step": 1490 + }, + { + "epoch": 0.023856, + "grad_norm": 1.140625, + "learning_rate": 9.840322580645162e-05, + "loss": 0.2123, + "step": 1491 + }, + { + "epoch": 0.023872, + "grad_norm": 0.99609375, + "learning_rate": 9.840161290322582e-05, + "loss": 0.1769, + "step": 1492 + }, + { + "epoch": 0.023888, + "grad_norm": 2.03125, + "learning_rate": 9.84e-05, + "loss": 0.2663, + "step": 1493 + }, + { + "epoch": 0.023904, + "grad_norm": 1.0859375, + "learning_rate": 9.83983870967742e-05, + "loss": 0.2297, + "step": 1494 + }, + { + "epoch": 0.02392, + "grad_norm": 0.83203125, + "learning_rate": 9.839677419354839e-05, + "loss": 0.2046, + "step": 1495 + }, + { + "epoch": 0.023936, + "grad_norm": 1.1953125, + "learning_rate": 9.839516129032259e-05, + "loss": 0.2329, + "step": 1496 + }, + { + "epoch": 0.023952, + "grad_norm": 1.546875, + "learning_rate": 9.839354838709678e-05, + "loss": 0.2264, + "step": 1497 + }, + { + "epoch": 0.023968, + "grad_norm": 0.9921875, + "learning_rate": 9.839193548387096e-05, + "loss": 0.2092, + "step": 1498 + }, + { + "epoch": 0.023984, + "grad_norm": 0.69921875, + "learning_rate": 9.839032258064516e-05, + "loss": 0.1642, + "step": 1499 + }, + { + "epoch": 0.024, + "grad_norm": 0.88671875, + "learning_rate": 9.838870967741935e-05, + "loss": 0.1899, + "step": 1500 + }, + { + "epoch": 0.024016, + "grad_norm": 1.0546875, + "learning_rate": 9.838709677419355e-05, + "loss": 0.2023, + "step": 1501 + }, + { + "epoch": 0.024032, + "grad_norm": 1.2265625, + "learning_rate": 9.838548387096775e-05, + "loss": 0.2088, + "step": 1502 + }, + { + "epoch": 0.024048, + "grad_norm": 0.9296875, + "learning_rate": 9.838387096774195e-05, + "loss": 0.1867, + "step": 1503 + }, + { + "epoch": 0.024064, + "grad_norm": 1.0390625, + "learning_rate": 9.838225806451613e-05, + "loss": 0.2339, + "step": 1504 + }, + { + "epoch": 0.02408, + "grad_norm": 1.2109375, + "learning_rate": 9.838064516129033e-05, + "loss": 0.2068, + "step": 1505 + }, + { + "epoch": 0.024096, + "grad_norm": 0.8515625, + "learning_rate": 9.837903225806452e-05, + "loss": 0.192, + "step": 1506 + }, + { + "epoch": 0.024112, + "grad_norm": 0.671875, + "learning_rate": 9.837741935483872e-05, + "loss": 0.1467, + "step": 1507 + }, + { + "epoch": 0.024128, + "grad_norm": 1.2578125, + "learning_rate": 9.83758064516129e-05, + "loss": 0.2131, + "step": 1508 + }, + { + "epoch": 0.024144, + "grad_norm": 0.7109375, + "learning_rate": 9.83741935483871e-05, + "loss": 0.146, + "step": 1509 + }, + { + "epoch": 0.02416, + "grad_norm": 1.1640625, + "learning_rate": 9.837258064516129e-05, + "loss": 0.151, + "step": 1510 + }, + { + "epoch": 0.024176, + "grad_norm": 1.1953125, + "learning_rate": 9.837096774193549e-05, + "loss": 0.2315, + "step": 1511 + }, + { + "epoch": 0.024192, + "grad_norm": 1.171875, + "learning_rate": 9.836935483870968e-05, + "loss": 0.2117, + "step": 1512 + }, + { + "epoch": 0.024208, + "grad_norm": 0.890625, + "learning_rate": 9.836774193548386e-05, + "loss": 0.2022, + "step": 1513 + }, + { + "epoch": 0.024224, + "grad_norm": 0.8359375, + "learning_rate": 9.836612903225806e-05, + "loss": 0.169, + "step": 1514 + }, + { + "epoch": 0.02424, + "grad_norm": 1.28125, + "learning_rate": 9.836451612903226e-05, + "loss": 0.2377, + "step": 1515 + }, + { + "epoch": 0.024256, + "grad_norm": 0.92578125, + "learning_rate": 9.836290322580646e-05, + "loss": 0.214, + "step": 1516 + }, + { + "epoch": 0.024272, + "grad_norm": 0.94921875, + "learning_rate": 9.836129032258065e-05, + "loss": 0.1934, + "step": 1517 + }, + { + "epoch": 0.024288, + "grad_norm": 0.8671875, + "learning_rate": 9.835967741935485e-05, + "loss": 0.1949, + "step": 1518 + }, + { + "epoch": 0.024304, + "grad_norm": 0.96875, + "learning_rate": 9.835806451612903e-05, + "loss": 0.1909, + "step": 1519 + }, + { + "epoch": 0.02432, + "grad_norm": 1.3125, + "learning_rate": 9.835645161290323e-05, + "loss": 0.2023, + "step": 1520 + }, + { + "epoch": 0.024336, + "grad_norm": 0.81640625, + "learning_rate": 9.835483870967742e-05, + "loss": 0.172, + "step": 1521 + }, + { + "epoch": 0.024352, + "grad_norm": 1.0703125, + "learning_rate": 9.835322580645162e-05, + "loss": 0.2664, + "step": 1522 + }, + { + "epoch": 0.024368, + "grad_norm": 0.9765625, + "learning_rate": 9.83516129032258e-05, + "loss": 0.2331, + "step": 1523 + }, + { + "epoch": 0.024384, + "grad_norm": 1.015625, + "learning_rate": 9.835e-05, + "loss": 0.1848, + "step": 1524 + }, + { + "epoch": 0.0244, + "grad_norm": 1.0625, + "learning_rate": 9.834838709677419e-05, + "loss": 0.1807, + "step": 1525 + }, + { + "epoch": 0.024416, + "grad_norm": 1.4921875, + "learning_rate": 9.834677419354839e-05, + "loss": 0.2056, + "step": 1526 + }, + { + "epoch": 0.024432, + "grad_norm": 0.90625, + "learning_rate": 9.834516129032259e-05, + "loss": 0.1935, + "step": 1527 + }, + { + "epoch": 0.024448, + "grad_norm": 1.2265625, + "learning_rate": 9.834354838709679e-05, + "loss": 0.2479, + "step": 1528 + }, + { + "epoch": 0.024464, + "grad_norm": 0.97265625, + "learning_rate": 9.834193548387098e-05, + "loss": 0.2438, + "step": 1529 + }, + { + "epoch": 0.02448, + "grad_norm": 1.40625, + "learning_rate": 9.834032258064516e-05, + "loss": 0.226, + "step": 1530 + }, + { + "epoch": 0.024496, + "grad_norm": 1.1328125, + "learning_rate": 9.833870967741936e-05, + "loss": 0.1974, + "step": 1531 + }, + { + "epoch": 0.024512, + "grad_norm": 1.0625, + "learning_rate": 9.833709677419355e-05, + "loss": 0.1752, + "step": 1532 + }, + { + "epoch": 0.024528, + "grad_norm": 1.3203125, + "learning_rate": 9.833548387096775e-05, + "loss": 0.2073, + "step": 1533 + }, + { + "epoch": 0.024544, + "grad_norm": 1.40625, + "learning_rate": 9.833387096774193e-05, + "loss": 0.2098, + "step": 1534 + }, + { + "epoch": 0.02456, + "grad_norm": 1.2421875, + "learning_rate": 9.833225806451613e-05, + "loss": 0.176, + "step": 1535 + }, + { + "epoch": 0.024576, + "grad_norm": 1.0234375, + "learning_rate": 9.833064516129032e-05, + "loss": 0.1937, + "step": 1536 + }, + { + "epoch": 0.024592, + "grad_norm": 0.8984375, + "learning_rate": 9.832903225806452e-05, + "loss": 0.2549, + "step": 1537 + }, + { + "epoch": 0.024608, + "grad_norm": 1.71875, + "learning_rate": 9.832741935483872e-05, + "loss": 0.2285, + "step": 1538 + }, + { + "epoch": 0.024624, + "grad_norm": 0.7734375, + "learning_rate": 9.832580645161292e-05, + "loss": 0.1968, + "step": 1539 + }, + { + "epoch": 0.02464, + "grad_norm": 0.95703125, + "learning_rate": 9.83241935483871e-05, + "loss": 0.2239, + "step": 1540 + }, + { + "epoch": 0.024656, + "grad_norm": 1.265625, + "learning_rate": 9.83225806451613e-05, + "loss": 0.2023, + "step": 1541 + }, + { + "epoch": 0.024672, + "grad_norm": 1.609375, + "learning_rate": 9.832096774193549e-05, + "loss": 0.229, + "step": 1542 + }, + { + "epoch": 0.024688, + "grad_norm": 1.3046875, + "learning_rate": 9.831935483870969e-05, + "loss": 0.1827, + "step": 1543 + }, + { + "epoch": 0.024704, + "grad_norm": 1.015625, + "learning_rate": 9.831774193548388e-05, + "loss": 0.1608, + "step": 1544 + }, + { + "epoch": 0.02472, + "grad_norm": 1.2421875, + "learning_rate": 9.831612903225806e-05, + "loss": 0.1986, + "step": 1545 + }, + { + "epoch": 0.024736, + "grad_norm": 1.5078125, + "learning_rate": 9.831451612903226e-05, + "loss": 0.1639, + "step": 1546 + }, + { + "epoch": 0.024752, + "grad_norm": 1.4609375, + "learning_rate": 9.831290322580645e-05, + "loss": 0.2293, + "step": 1547 + }, + { + "epoch": 0.024768, + "grad_norm": 1.765625, + "learning_rate": 9.831129032258065e-05, + "loss": 0.2052, + "step": 1548 + }, + { + "epoch": 0.024784, + "grad_norm": 1.3359375, + "learning_rate": 9.830967741935483e-05, + "loss": 0.2525, + "step": 1549 + }, + { + "epoch": 0.0248, + "grad_norm": 1.7734375, + "learning_rate": 9.830806451612903e-05, + "loss": 0.2397, + "step": 1550 + }, + { + "epoch": 0.024816, + "grad_norm": 0.90625, + "learning_rate": 9.830645161290323e-05, + "loss": 0.1924, + "step": 1551 + }, + { + "epoch": 0.024832, + "grad_norm": 1.5234375, + "learning_rate": 9.830483870967743e-05, + "loss": 0.1936, + "step": 1552 + }, + { + "epoch": 0.024848, + "grad_norm": 0.875, + "learning_rate": 9.830322580645162e-05, + "loss": 0.1843, + "step": 1553 + }, + { + "epoch": 0.024864, + "grad_norm": 0.80859375, + "learning_rate": 9.830161290322582e-05, + "loss": 0.1741, + "step": 1554 + }, + { + "epoch": 0.02488, + "grad_norm": 0.7890625, + "learning_rate": 9.83e-05, + "loss": 0.1968, + "step": 1555 + }, + { + "epoch": 0.024896, + "grad_norm": 1.03125, + "learning_rate": 9.82983870967742e-05, + "loss": 0.1891, + "step": 1556 + }, + { + "epoch": 0.024912, + "grad_norm": 1.3046875, + "learning_rate": 9.829677419354839e-05, + "loss": 0.2462, + "step": 1557 + }, + { + "epoch": 0.024928, + "grad_norm": 1.265625, + "learning_rate": 9.829516129032259e-05, + "loss": 0.2187, + "step": 1558 + }, + { + "epoch": 0.024944, + "grad_norm": 0.890625, + "learning_rate": 9.829354838709678e-05, + "loss": 0.1892, + "step": 1559 + }, + { + "epoch": 0.02496, + "grad_norm": 0.953125, + "learning_rate": 9.829193548387096e-05, + "loss": 0.1972, + "step": 1560 + }, + { + "epoch": 0.024976, + "grad_norm": 0.78125, + "learning_rate": 9.829032258064516e-05, + "loss": 0.1544, + "step": 1561 + }, + { + "epoch": 0.024992, + "grad_norm": 1.0625, + "learning_rate": 9.828870967741936e-05, + "loss": 0.1786, + "step": 1562 + }, + { + "epoch": 0.025008, + "grad_norm": 0.8984375, + "learning_rate": 9.828709677419356e-05, + "loss": 0.2252, + "step": 1563 + }, + { + "epoch": 0.025024, + "grad_norm": 1.0859375, + "learning_rate": 9.828548387096775e-05, + "loss": 0.1739, + "step": 1564 + }, + { + "epoch": 0.02504, + "grad_norm": 0.94921875, + "learning_rate": 9.828387096774195e-05, + "loss": 0.186, + "step": 1565 + }, + { + "epoch": 0.025056, + "grad_norm": 1.0703125, + "learning_rate": 9.828225806451613e-05, + "loss": 0.1693, + "step": 1566 + }, + { + "epoch": 0.025072, + "grad_norm": 1.453125, + "learning_rate": 9.828064516129033e-05, + "loss": 0.2222, + "step": 1567 + }, + { + "epoch": 0.025088, + "grad_norm": 0.9140625, + "learning_rate": 9.827903225806452e-05, + "loss": 0.1998, + "step": 1568 + }, + { + "epoch": 0.025104, + "grad_norm": 0.69140625, + "learning_rate": 9.827741935483872e-05, + "loss": 0.1812, + "step": 1569 + }, + { + "epoch": 0.02512, + "grad_norm": 1.03125, + "learning_rate": 9.82758064516129e-05, + "loss": 0.201, + "step": 1570 + }, + { + "epoch": 0.025136, + "grad_norm": 1.2265625, + "learning_rate": 9.82741935483871e-05, + "loss": 0.2096, + "step": 1571 + }, + { + "epoch": 0.025152, + "grad_norm": 0.95703125, + "learning_rate": 9.827258064516129e-05, + "loss": 0.1821, + "step": 1572 + }, + { + "epoch": 0.025168, + "grad_norm": 0.98828125, + "learning_rate": 9.827096774193549e-05, + "loss": 0.1588, + "step": 1573 + }, + { + "epoch": 0.025184, + "grad_norm": 1.0625, + "learning_rate": 9.826935483870968e-05, + "loss": 0.2086, + "step": 1574 + }, + { + "epoch": 0.0252, + "grad_norm": 0.9375, + "learning_rate": 9.826774193548387e-05, + "loss": 0.1971, + "step": 1575 + }, + { + "epoch": 0.025216, + "grad_norm": 1.1640625, + "learning_rate": 9.826612903225807e-05, + "loss": 0.2057, + "step": 1576 + }, + { + "epoch": 0.025232, + "grad_norm": 0.82421875, + "learning_rate": 9.826451612903226e-05, + "loss": 0.208, + "step": 1577 + }, + { + "epoch": 0.025248, + "grad_norm": 2.046875, + "learning_rate": 9.826290322580646e-05, + "loss": 0.2286, + "step": 1578 + }, + { + "epoch": 0.025264, + "grad_norm": 1.2421875, + "learning_rate": 9.826129032258065e-05, + "loss": 0.2277, + "step": 1579 + }, + { + "epoch": 0.02528, + "grad_norm": 1.109375, + "learning_rate": 9.825967741935485e-05, + "loss": 0.2141, + "step": 1580 + }, + { + "epoch": 0.025296, + "grad_norm": 0.98046875, + "learning_rate": 9.825806451612903e-05, + "loss": 0.1927, + "step": 1581 + }, + { + "epoch": 0.025312, + "grad_norm": 1.109375, + "learning_rate": 9.825645161290323e-05, + "loss": 0.1873, + "step": 1582 + }, + { + "epoch": 0.025328, + "grad_norm": 1.1484375, + "learning_rate": 9.825483870967742e-05, + "loss": 0.2075, + "step": 1583 + }, + { + "epoch": 0.025344, + "grad_norm": 1.3515625, + "learning_rate": 9.825322580645162e-05, + "loss": 0.2004, + "step": 1584 + }, + { + "epoch": 0.02536, + "grad_norm": 0.87109375, + "learning_rate": 9.82516129032258e-05, + "loss": 0.1994, + "step": 1585 + }, + { + "epoch": 0.025376, + "grad_norm": 1.65625, + "learning_rate": 9.825e-05, + "loss": 0.2373, + "step": 1586 + }, + { + "epoch": 0.025392, + "grad_norm": 1.09375, + "learning_rate": 9.82483870967742e-05, + "loss": 0.1983, + "step": 1587 + }, + { + "epoch": 0.025408, + "grad_norm": 1.1171875, + "learning_rate": 9.82467741935484e-05, + "loss": 0.2305, + "step": 1588 + }, + { + "epoch": 0.025424, + "grad_norm": 0.984375, + "learning_rate": 9.824516129032259e-05, + "loss": 0.2187, + "step": 1589 + }, + { + "epoch": 0.02544, + "grad_norm": 1.0703125, + "learning_rate": 9.824354838709679e-05, + "loss": 0.1961, + "step": 1590 + }, + { + "epoch": 0.025456, + "grad_norm": 0.71484375, + "learning_rate": 9.824193548387097e-05, + "loss": 0.1776, + "step": 1591 + }, + { + "epoch": 0.025472, + "grad_norm": 1.2578125, + "learning_rate": 9.824032258064516e-05, + "loss": 0.2602, + "step": 1592 + }, + { + "epoch": 0.025488, + "grad_norm": 1.3203125, + "learning_rate": 9.823870967741936e-05, + "loss": 0.2294, + "step": 1593 + }, + { + "epoch": 0.025504, + "grad_norm": 0.88671875, + "learning_rate": 9.823709677419355e-05, + "loss": 0.1752, + "step": 1594 + }, + { + "epoch": 0.02552, + "grad_norm": 0.9609375, + "learning_rate": 9.823548387096775e-05, + "loss": 0.203, + "step": 1595 + }, + { + "epoch": 0.025536, + "grad_norm": 1.1015625, + "learning_rate": 9.823387096774193e-05, + "loss": 0.1865, + "step": 1596 + }, + { + "epoch": 0.025552, + "grad_norm": 0.99609375, + "learning_rate": 9.823225806451613e-05, + "loss": 0.1838, + "step": 1597 + }, + { + "epoch": 0.025568, + "grad_norm": 0.8046875, + "learning_rate": 9.823064516129033e-05, + "loss": 0.1768, + "step": 1598 + }, + { + "epoch": 0.025584, + "grad_norm": 0.98828125, + "learning_rate": 9.822903225806453e-05, + "loss": 0.191, + "step": 1599 + }, + { + "epoch": 0.0256, + "grad_norm": 0.91796875, + "learning_rate": 9.822741935483872e-05, + "loss": 0.2118, + "step": 1600 + }, + { + "epoch": 0.025616, + "grad_norm": 0.890625, + "learning_rate": 9.822580645161292e-05, + "loss": 0.2252, + "step": 1601 + }, + { + "epoch": 0.025632, + "grad_norm": 1.2734375, + "learning_rate": 9.82241935483871e-05, + "loss": 0.1469, + "step": 1602 + }, + { + "epoch": 0.025648, + "grad_norm": 0.96875, + "learning_rate": 9.82225806451613e-05, + "loss": 0.224, + "step": 1603 + }, + { + "epoch": 0.025664, + "grad_norm": 1.546875, + "learning_rate": 9.822096774193549e-05, + "loss": 0.2931, + "step": 1604 + }, + { + "epoch": 0.02568, + "grad_norm": 0.93359375, + "learning_rate": 9.821935483870969e-05, + "loss": 0.2206, + "step": 1605 + }, + { + "epoch": 0.025696, + "grad_norm": 1.0390625, + "learning_rate": 9.821774193548387e-05, + "loss": 0.1928, + "step": 1606 + }, + { + "epoch": 0.025712, + "grad_norm": 1.4921875, + "learning_rate": 9.821612903225806e-05, + "loss": 0.215, + "step": 1607 + }, + { + "epoch": 0.025728, + "grad_norm": 1.0703125, + "learning_rate": 9.821451612903226e-05, + "loss": 0.2043, + "step": 1608 + }, + { + "epoch": 0.025744, + "grad_norm": 0.75, + "learning_rate": 9.821290322580645e-05, + "loss": 0.1405, + "step": 1609 + }, + { + "epoch": 0.02576, + "grad_norm": 1.046875, + "learning_rate": 9.821129032258064e-05, + "loss": 0.217, + "step": 1610 + }, + { + "epoch": 0.025776, + "grad_norm": 0.8828125, + "learning_rate": 9.820967741935484e-05, + "loss": 0.1816, + "step": 1611 + }, + { + "epoch": 0.025792, + "grad_norm": 0.88671875, + "learning_rate": 9.820806451612904e-05, + "loss": 0.2084, + "step": 1612 + }, + { + "epoch": 0.025808, + "grad_norm": 1.1953125, + "learning_rate": 9.820645161290323e-05, + "loss": 0.2183, + "step": 1613 + }, + { + "epoch": 0.025824, + "grad_norm": 0.6796875, + "learning_rate": 9.820483870967743e-05, + "loss": 0.1889, + "step": 1614 + }, + { + "epoch": 0.02584, + "grad_norm": 0.9296875, + "learning_rate": 9.820322580645162e-05, + "loss": 0.1949, + "step": 1615 + }, + { + "epoch": 0.025856, + "grad_norm": 1.828125, + "learning_rate": 9.820161290322582e-05, + "loss": 0.2486, + "step": 1616 + }, + { + "epoch": 0.025872, + "grad_norm": 1.15625, + "learning_rate": 9.82e-05, + "loss": 0.1835, + "step": 1617 + }, + { + "epoch": 0.025888, + "grad_norm": 1.078125, + "learning_rate": 9.81983870967742e-05, + "loss": 0.225, + "step": 1618 + }, + { + "epoch": 0.025904, + "grad_norm": 0.9375, + "learning_rate": 9.819677419354839e-05, + "loss": 0.2105, + "step": 1619 + }, + { + "epoch": 0.02592, + "grad_norm": 1.171875, + "learning_rate": 9.819516129032259e-05, + "loss": 0.2211, + "step": 1620 + }, + { + "epoch": 0.025936, + "grad_norm": 1.0625, + "learning_rate": 9.819354838709677e-05, + "loss": 0.2004, + "step": 1621 + }, + { + "epoch": 0.025952, + "grad_norm": 0.94140625, + "learning_rate": 9.819193548387097e-05, + "loss": 0.1838, + "step": 1622 + }, + { + "epoch": 0.025968, + "grad_norm": 1.3046875, + "learning_rate": 9.819032258064517e-05, + "loss": 0.1788, + "step": 1623 + }, + { + "epoch": 0.025984, + "grad_norm": 1.0859375, + "learning_rate": 9.818870967741936e-05, + "loss": 0.2066, + "step": 1624 + }, + { + "epoch": 0.026, + "grad_norm": 1.0390625, + "learning_rate": 9.818709677419356e-05, + "loss": 0.1999, + "step": 1625 + }, + { + "epoch": 0.026016, + "grad_norm": 1.1484375, + "learning_rate": 9.818548387096774e-05, + "loss": 0.2007, + "step": 1626 + }, + { + "epoch": 0.026032, + "grad_norm": 1.0625, + "learning_rate": 9.818387096774194e-05, + "loss": 0.2021, + "step": 1627 + }, + { + "epoch": 0.026048, + "grad_norm": 0.85546875, + "learning_rate": 9.818225806451613e-05, + "loss": 0.2059, + "step": 1628 + }, + { + "epoch": 0.026064, + "grad_norm": 1.5859375, + "learning_rate": 9.818064516129033e-05, + "loss": 0.2242, + "step": 1629 + }, + { + "epoch": 0.02608, + "grad_norm": 0.98828125, + "learning_rate": 9.817903225806452e-05, + "loss": 0.178, + "step": 1630 + }, + { + "epoch": 0.026096, + "grad_norm": 0.91796875, + "learning_rate": 9.817741935483872e-05, + "loss": 0.2307, + "step": 1631 + }, + { + "epoch": 0.026112, + "grad_norm": 0.96484375, + "learning_rate": 9.81758064516129e-05, + "loss": 0.2026, + "step": 1632 + }, + { + "epoch": 0.026128, + "grad_norm": 0.93359375, + "learning_rate": 9.81741935483871e-05, + "loss": 0.2223, + "step": 1633 + }, + { + "epoch": 0.026144, + "grad_norm": 1.15625, + "learning_rate": 9.81725806451613e-05, + "loss": 0.2659, + "step": 1634 + }, + { + "epoch": 0.02616, + "grad_norm": 0.85546875, + "learning_rate": 9.81709677419355e-05, + "loss": 0.2013, + "step": 1635 + }, + { + "epoch": 0.026176, + "grad_norm": 1.1171875, + "learning_rate": 9.816935483870969e-05, + "loss": 0.2141, + "step": 1636 + }, + { + "epoch": 0.026192, + "grad_norm": 0.9296875, + "learning_rate": 9.816774193548389e-05, + "loss": 0.2156, + "step": 1637 + }, + { + "epoch": 0.026208, + "grad_norm": 0.921875, + "learning_rate": 9.816612903225807e-05, + "loss": 0.1857, + "step": 1638 + }, + { + "epoch": 0.026224, + "grad_norm": 0.85546875, + "learning_rate": 9.816451612903226e-05, + "loss": 0.19, + "step": 1639 + }, + { + "epoch": 0.02624, + "grad_norm": 1.0078125, + "learning_rate": 9.816290322580646e-05, + "loss": 0.1949, + "step": 1640 + }, + { + "epoch": 0.026256, + "grad_norm": 1.28125, + "learning_rate": 9.816129032258064e-05, + "loss": 0.1849, + "step": 1641 + }, + { + "epoch": 0.026272, + "grad_norm": 1.1484375, + "learning_rate": 9.815967741935484e-05, + "loss": 0.2305, + "step": 1642 + }, + { + "epoch": 0.026288, + "grad_norm": 0.70703125, + "learning_rate": 9.815806451612903e-05, + "loss": 0.2052, + "step": 1643 + }, + { + "epoch": 0.026304, + "grad_norm": 1.7421875, + "learning_rate": 9.815645161290323e-05, + "loss": 0.2207, + "step": 1644 + }, + { + "epoch": 0.02632, + "grad_norm": 1.1875, + "learning_rate": 9.815483870967742e-05, + "loss": 0.2172, + "step": 1645 + }, + { + "epoch": 0.026336, + "grad_norm": 1.0703125, + "learning_rate": 9.815322580645161e-05, + "loss": 0.2041, + "step": 1646 + }, + { + "epoch": 0.026352, + "grad_norm": 0.97265625, + "learning_rate": 9.815161290322581e-05, + "loss": 0.1637, + "step": 1647 + }, + { + "epoch": 0.026368, + "grad_norm": 1.1953125, + "learning_rate": 9.815000000000001e-05, + "loss": 0.1426, + "step": 1648 + }, + { + "epoch": 0.026384, + "grad_norm": 0.94140625, + "learning_rate": 9.81483870967742e-05, + "loss": 0.2069, + "step": 1649 + }, + { + "epoch": 0.0264, + "grad_norm": 1.0546875, + "learning_rate": 9.81467741935484e-05, + "loss": 0.2305, + "step": 1650 + }, + { + "epoch": 0.026416, + "grad_norm": 1.34375, + "learning_rate": 9.814516129032259e-05, + "loss": 0.1939, + "step": 1651 + }, + { + "epoch": 0.026432, + "grad_norm": 0.6953125, + "learning_rate": 9.814354838709679e-05, + "loss": 0.1793, + "step": 1652 + }, + { + "epoch": 0.026448, + "grad_norm": 1.1796875, + "learning_rate": 9.814193548387097e-05, + "loss": 0.2436, + "step": 1653 + }, + { + "epoch": 0.026464, + "grad_norm": 1.03125, + "learning_rate": 9.814032258064516e-05, + "loss": 0.2075, + "step": 1654 + }, + { + "epoch": 0.02648, + "grad_norm": 1.03125, + "learning_rate": 9.813870967741936e-05, + "loss": 0.2291, + "step": 1655 + }, + { + "epoch": 0.026496, + "grad_norm": 1.0, + "learning_rate": 9.813709677419354e-05, + "loss": 0.1972, + "step": 1656 + }, + { + "epoch": 0.026512, + "grad_norm": 1.125, + "learning_rate": 9.813548387096774e-05, + "loss": 0.2182, + "step": 1657 + }, + { + "epoch": 0.026528, + "grad_norm": 1.1171875, + "learning_rate": 9.813387096774194e-05, + "loss": 0.229, + "step": 1658 + }, + { + "epoch": 0.026544, + "grad_norm": 0.97265625, + "learning_rate": 9.813225806451614e-05, + "loss": 0.1912, + "step": 1659 + }, + { + "epoch": 0.02656, + "grad_norm": 0.8671875, + "learning_rate": 9.813064516129033e-05, + "loss": 0.2155, + "step": 1660 + }, + { + "epoch": 0.026576, + "grad_norm": 1.0625, + "learning_rate": 9.812903225806453e-05, + "loss": 0.2151, + "step": 1661 + }, + { + "epoch": 0.026592, + "grad_norm": 0.99609375, + "learning_rate": 9.812741935483871e-05, + "loss": 0.2028, + "step": 1662 + }, + { + "epoch": 0.026608, + "grad_norm": 1.1875, + "learning_rate": 9.812580645161291e-05, + "loss": 0.2139, + "step": 1663 + }, + { + "epoch": 0.026624, + "grad_norm": 1.21875, + "learning_rate": 9.81241935483871e-05, + "loss": 0.1622, + "step": 1664 + }, + { + "epoch": 0.02664, + "grad_norm": 1.0859375, + "learning_rate": 9.81225806451613e-05, + "loss": 0.2161, + "step": 1665 + }, + { + "epoch": 0.026656, + "grad_norm": 1.46875, + "learning_rate": 9.812096774193549e-05, + "loss": 0.2086, + "step": 1666 + }, + { + "epoch": 0.026672, + "grad_norm": 0.7734375, + "learning_rate": 9.811935483870969e-05, + "loss": 0.1411, + "step": 1667 + }, + { + "epoch": 0.026688, + "grad_norm": 2.0625, + "learning_rate": 9.811774193548387e-05, + "loss": 0.2012, + "step": 1668 + }, + { + "epoch": 0.026704, + "grad_norm": 1.6171875, + "learning_rate": 9.811612903225807e-05, + "loss": 0.2619, + "step": 1669 + }, + { + "epoch": 0.02672, + "grad_norm": 0.98828125, + "learning_rate": 9.811451612903226e-05, + "loss": 0.2051, + "step": 1670 + }, + { + "epoch": 0.026736, + "grad_norm": 0.7890625, + "learning_rate": 9.811290322580646e-05, + "loss": 0.1595, + "step": 1671 + }, + { + "epoch": 0.026752, + "grad_norm": 0.8203125, + "learning_rate": 9.811129032258066e-05, + "loss": 0.2004, + "step": 1672 + }, + { + "epoch": 0.026768, + "grad_norm": 1.0, + "learning_rate": 9.810967741935484e-05, + "loss": 0.2232, + "step": 1673 + }, + { + "epoch": 0.026784, + "grad_norm": 0.91796875, + "learning_rate": 9.810806451612904e-05, + "loss": 0.1634, + "step": 1674 + }, + { + "epoch": 0.0268, + "grad_norm": 0.79296875, + "learning_rate": 9.810645161290323e-05, + "loss": 0.2185, + "step": 1675 + }, + { + "epoch": 0.026816, + "grad_norm": 1.1484375, + "learning_rate": 9.810483870967743e-05, + "loss": 0.2003, + "step": 1676 + }, + { + "epoch": 0.026832, + "grad_norm": 1.2265625, + "learning_rate": 9.810322580645161e-05, + "loss": 0.1844, + "step": 1677 + }, + { + "epoch": 0.026848, + "grad_norm": 1.0546875, + "learning_rate": 9.810161290322581e-05, + "loss": 0.2184, + "step": 1678 + }, + { + "epoch": 0.026864, + "grad_norm": 1.1796875, + "learning_rate": 9.81e-05, + "loss": 0.2186, + "step": 1679 + }, + { + "epoch": 0.02688, + "grad_norm": 1.4921875, + "learning_rate": 9.80983870967742e-05, + "loss": 0.2411, + "step": 1680 + }, + { + "epoch": 0.026896, + "grad_norm": 0.9140625, + "learning_rate": 9.809677419354838e-05, + "loss": 0.2203, + "step": 1681 + }, + { + "epoch": 0.026912, + "grad_norm": 0.9609375, + "learning_rate": 9.809516129032258e-05, + "loss": 0.2144, + "step": 1682 + }, + { + "epoch": 0.026928, + "grad_norm": 0.99609375, + "learning_rate": 9.809354838709678e-05, + "loss": 0.1642, + "step": 1683 + }, + { + "epoch": 0.026944, + "grad_norm": 1.1484375, + "learning_rate": 9.809193548387097e-05, + "loss": 0.1732, + "step": 1684 + }, + { + "epoch": 0.02696, + "grad_norm": 0.87109375, + "learning_rate": 9.809032258064517e-05, + "loss": 0.2217, + "step": 1685 + }, + { + "epoch": 0.026976, + "grad_norm": 1.1796875, + "learning_rate": 9.808870967741936e-05, + "loss": 0.2109, + "step": 1686 + }, + { + "epoch": 0.026992, + "grad_norm": 1.125, + "learning_rate": 9.808709677419356e-05, + "loss": 0.2307, + "step": 1687 + }, + { + "epoch": 0.027008, + "grad_norm": 0.828125, + "learning_rate": 9.808548387096774e-05, + "loss": 0.2064, + "step": 1688 + }, + { + "epoch": 0.027024, + "grad_norm": 1.0625, + "learning_rate": 9.808387096774194e-05, + "loss": 0.2248, + "step": 1689 + }, + { + "epoch": 0.02704, + "grad_norm": 1.1640625, + "learning_rate": 9.808225806451613e-05, + "loss": 0.2243, + "step": 1690 + }, + { + "epoch": 0.027056, + "grad_norm": 0.67578125, + "learning_rate": 9.808064516129033e-05, + "loss": 0.1877, + "step": 1691 + }, + { + "epoch": 0.027072, + "grad_norm": 0.81640625, + "learning_rate": 9.807903225806451e-05, + "loss": 0.2001, + "step": 1692 + }, + { + "epoch": 0.027088, + "grad_norm": 1.171875, + "learning_rate": 9.807741935483871e-05, + "loss": 0.2487, + "step": 1693 + }, + { + "epoch": 0.027104, + "grad_norm": 0.8515625, + "learning_rate": 9.807580645161291e-05, + "loss": 0.1836, + "step": 1694 + }, + { + "epoch": 0.02712, + "grad_norm": 0.9921875, + "learning_rate": 9.807419354838711e-05, + "loss": 0.2171, + "step": 1695 + }, + { + "epoch": 0.027136, + "grad_norm": 1.0859375, + "learning_rate": 9.80725806451613e-05, + "loss": 0.2127, + "step": 1696 + }, + { + "epoch": 0.027152, + "grad_norm": 0.69140625, + "learning_rate": 9.80709677419355e-05, + "loss": 0.1924, + "step": 1697 + }, + { + "epoch": 0.027168, + "grad_norm": 1.2421875, + "learning_rate": 9.806935483870968e-05, + "loss": 0.2063, + "step": 1698 + }, + { + "epoch": 0.027184, + "grad_norm": 1.0234375, + "learning_rate": 9.806774193548388e-05, + "loss": 0.2352, + "step": 1699 + }, + { + "epoch": 0.0272, + "grad_norm": 0.95703125, + "learning_rate": 9.806612903225807e-05, + "loss": 0.24, + "step": 1700 + }, + { + "epoch": 0.027216, + "grad_norm": 1.0234375, + "learning_rate": 9.806451612903226e-05, + "loss": 0.2083, + "step": 1701 + }, + { + "epoch": 0.027232, + "grad_norm": 0.87109375, + "learning_rate": 9.806290322580646e-05, + "loss": 0.2088, + "step": 1702 + }, + { + "epoch": 0.027248, + "grad_norm": 0.859375, + "learning_rate": 9.806129032258064e-05, + "loss": 0.1798, + "step": 1703 + }, + { + "epoch": 0.027264, + "grad_norm": 1.2421875, + "learning_rate": 9.805967741935484e-05, + "loss": 0.2349, + "step": 1704 + }, + { + "epoch": 0.02728, + "grad_norm": 1.25, + "learning_rate": 9.805806451612903e-05, + "loss": 0.2027, + "step": 1705 + }, + { + "epoch": 0.027296, + "grad_norm": 0.9140625, + "learning_rate": 9.805645161290323e-05, + "loss": 0.2283, + "step": 1706 + }, + { + "epoch": 0.027312, + "grad_norm": 0.89453125, + "learning_rate": 9.805483870967743e-05, + "loss": 0.1683, + "step": 1707 + }, + { + "epoch": 0.027328, + "grad_norm": 1.296875, + "learning_rate": 9.805322580645163e-05, + "loss": 0.1903, + "step": 1708 + }, + { + "epoch": 0.027344, + "grad_norm": 1.109375, + "learning_rate": 9.805161290322581e-05, + "loss": 0.189, + "step": 1709 + }, + { + "epoch": 0.02736, + "grad_norm": 0.80859375, + "learning_rate": 9.805000000000001e-05, + "loss": 0.2322, + "step": 1710 + }, + { + "epoch": 0.027376, + "grad_norm": 1.796875, + "learning_rate": 9.80483870967742e-05, + "loss": 0.2027, + "step": 1711 + }, + { + "epoch": 0.027392, + "grad_norm": 1.0234375, + "learning_rate": 9.80467741935484e-05, + "loss": 0.2182, + "step": 1712 + }, + { + "epoch": 0.027408, + "grad_norm": 1.03125, + "learning_rate": 9.804516129032258e-05, + "loss": 0.1903, + "step": 1713 + }, + { + "epoch": 0.027424, + "grad_norm": 1.265625, + "learning_rate": 9.804354838709678e-05, + "loss": 0.2386, + "step": 1714 + }, + { + "epoch": 0.02744, + "grad_norm": 0.78125, + "learning_rate": 9.804193548387097e-05, + "loss": 0.2061, + "step": 1715 + }, + { + "epoch": 0.027456, + "grad_norm": 0.984375, + "learning_rate": 9.804032258064516e-05, + "loss": 0.1928, + "step": 1716 + }, + { + "epoch": 0.027472, + "grad_norm": 1.078125, + "learning_rate": 9.803870967741935e-05, + "loss": 0.2163, + "step": 1717 + }, + { + "epoch": 0.027488, + "grad_norm": 1.0, + "learning_rate": 9.803709677419355e-05, + "loss": 0.1952, + "step": 1718 + }, + { + "epoch": 0.027504, + "grad_norm": 0.8671875, + "learning_rate": 9.803548387096775e-05, + "loss": 0.1784, + "step": 1719 + }, + { + "epoch": 0.02752, + "grad_norm": 1.71875, + "learning_rate": 9.803387096774194e-05, + "loss": 0.1993, + "step": 1720 + }, + { + "epoch": 0.027536, + "grad_norm": 0.8984375, + "learning_rate": 9.803225806451614e-05, + "loss": 0.1999, + "step": 1721 + }, + { + "epoch": 0.027552, + "grad_norm": 1.546875, + "learning_rate": 9.803064516129033e-05, + "loss": 0.2403, + "step": 1722 + }, + { + "epoch": 0.027568, + "grad_norm": 0.91015625, + "learning_rate": 9.802903225806453e-05, + "loss": 0.2293, + "step": 1723 + }, + { + "epoch": 0.027584, + "grad_norm": 0.83984375, + "learning_rate": 9.802741935483871e-05, + "loss": 0.1799, + "step": 1724 + }, + { + "epoch": 0.0276, + "grad_norm": 0.97265625, + "learning_rate": 9.802580645161291e-05, + "loss": 0.1588, + "step": 1725 + }, + { + "epoch": 0.027616, + "grad_norm": 1.0078125, + "learning_rate": 9.80241935483871e-05, + "loss": 0.1818, + "step": 1726 + }, + { + "epoch": 0.027632, + "grad_norm": 0.73046875, + "learning_rate": 9.80225806451613e-05, + "loss": 0.2304, + "step": 1727 + }, + { + "epoch": 0.027648, + "grad_norm": 1.5390625, + "learning_rate": 9.802096774193548e-05, + "loss": 0.1952, + "step": 1728 + }, + { + "epoch": 0.027664, + "grad_norm": 1.359375, + "learning_rate": 9.801935483870968e-05, + "loss": 0.2407, + "step": 1729 + }, + { + "epoch": 0.02768, + "grad_norm": 0.93359375, + "learning_rate": 9.801774193548388e-05, + "loss": 0.2116, + "step": 1730 + }, + { + "epoch": 0.027696, + "grad_norm": 0.875, + "learning_rate": 9.801612903225807e-05, + "loss": 0.1896, + "step": 1731 + }, + { + "epoch": 0.027712, + "grad_norm": 1.3984375, + "learning_rate": 9.801451612903227e-05, + "loss": 0.2529, + "step": 1732 + }, + { + "epoch": 0.027728, + "grad_norm": 0.75, + "learning_rate": 9.801290322580645e-05, + "loss": 0.1994, + "step": 1733 + }, + { + "epoch": 0.027744, + "grad_norm": 0.80078125, + "learning_rate": 9.801129032258065e-05, + "loss": 0.1942, + "step": 1734 + }, + { + "epoch": 0.02776, + "grad_norm": 0.921875, + "learning_rate": 9.800967741935484e-05, + "loss": 0.2087, + "step": 1735 + }, + { + "epoch": 0.027776, + "grad_norm": 1.171875, + "learning_rate": 9.800806451612904e-05, + "loss": 0.1848, + "step": 1736 + }, + { + "epoch": 0.027792, + "grad_norm": 1.0078125, + "learning_rate": 9.800645161290323e-05, + "loss": 0.1522, + "step": 1737 + }, + { + "epoch": 0.027808, + "grad_norm": 1.671875, + "learning_rate": 9.800483870967743e-05, + "loss": 0.1989, + "step": 1738 + }, + { + "epoch": 0.027824, + "grad_norm": 1.6875, + "learning_rate": 9.800322580645161e-05, + "loss": 0.2521, + "step": 1739 + }, + { + "epoch": 0.02784, + "grad_norm": 1.2265625, + "learning_rate": 9.800161290322581e-05, + "loss": 0.2283, + "step": 1740 + }, + { + "epoch": 0.027856, + "grad_norm": 0.8671875, + "learning_rate": 9.8e-05, + "loss": 0.1802, + "step": 1741 + }, + { + "epoch": 0.027872, + "grad_norm": 0.9453125, + "learning_rate": 9.79983870967742e-05, + "loss": 0.2152, + "step": 1742 + }, + { + "epoch": 0.027888, + "grad_norm": 0.83984375, + "learning_rate": 9.79967741935484e-05, + "loss": 0.2264, + "step": 1743 + }, + { + "epoch": 0.027904, + "grad_norm": 0.671875, + "learning_rate": 9.79951612903226e-05, + "loss": 0.1812, + "step": 1744 + }, + { + "epoch": 0.02792, + "grad_norm": 1.328125, + "learning_rate": 9.799354838709678e-05, + "loss": 0.1807, + "step": 1745 + }, + { + "epoch": 0.027936, + "grad_norm": 0.84765625, + "learning_rate": 9.799193548387097e-05, + "loss": 0.1802, + "step": 1746 + }, + { + "epoch": 0.027952, + "grad_norm": 1.140625, + "learning_rate": 9.799032258064517e-05, + "loss": 0.2244, + "step": 1747 + }, + { + "epoch": 0.027968, + "grad_norm": 0.80078125, + "learning_rate": 9.798870967741935e-05, + "loss": 0.2025, + "step": 1748 + }, + { + "epoch": 0.027984, + "grad_norm": 1.1640625, + "learning_rate": 9.798709677419355e-05, + "loss": 0.2027, + "step": 1749 + }, + { + "epoch": 0.028, + "grad_norm": 1.1328125, + "learning_rate": 9.798548387096774e-05, + "loss": 0.1923, + "step": 1750 + }, + { + "epoch": 0.028016, + "grad_norm": 1.0, + "learning_rate": 9.798387096774194e-05, + "loss": 0.1765, + "step": 1751 + }, + { + "epoch": 0.028032, + "grad_norm": 1.0390625, + "learning_rate": 9.798225806451612e-05, + "loss": 0.1843, + "step": 1752 + }, + { + "epoch": 0.028048, + "grad_norm": 0.796875, + "learning_rate": 9.798064516129032e-05, + "loss": 0.2261, + "step": 1753 + }, + { + "epoch": 0.028064, + "grad_norm": 0.83203125, + "learning_rate": 9.797903225806452e-05, + "loss": 0.1798, + "step": 1754 + }, + { + "epoch": 0.02808, + "grad_norm": 1.3046875, + "learning_rate": 9.797741935483872e-05, + "loss": 0.2113, + "step": 1755 + }, + { + "epoch": 0.028096, + "grad_norm": 0.85546875, + "learning_rate": 9.797580645161291e-05, + "loss": 0.194, + "step": 1756 + }, + { + "epoch": 0.028112, + "grad_norm": 1.0703125, + "learning_rate": 9.797419354838711e-05, + "loss": 0.159, + "step": 1757 + }, + { + "epoch": 0.028128, + "grad_norm": 1.125, + "learning_rate": 9.79725806451613e-05, + "loss": 0.243, + "step": 1758 + }, + { + "epoch": 0.028144, + "grad_norm": 0.86328125, + "learning_rate": 9.79709677419355e-05, + "loss": 0.1904, + "step": 1759 + }, + { + "epoch": 0.02816, + "grad_norm": 0.69140625, + "learning_rate": 9.796935483870968e-05, + "loss": 0.2139, + "step": 1760 + }, + { + "epoch": 0.028176, + "grad_norm": 1.28125, + "learning_rate": 9.796774193548388e-05, + "loss": 0.2021, + "step": 1761 + }, + { + "epoch": 0.028192, + "grad_norm": 0.73828125, + "learning_rate": 9.796612903225807e-05, + "loss": 0.1727, + "step": 1762 + }, + { + "epoch": 0.028208, + "grad_norm": 1.328125, + "learning_rate": 9.796451612903225e-05, + "loss": 0.1774, + "step": 1763 + }, + { + "epoch": 0.028224, + "grad_norm": 0.9609375, + "learning_rate": 9.796290322580645e-05, + "loss": 0.2174, + "step": 1764 + }, + { + "epoch": 0.02824, + "grad_norm": 1.1640625, + "learning_rate": 9.796129032258064e-05, + "loss": 0.2303, + "step": 1765 + }, + { + "epoch": 0.028256, + "grad_norm": 0.9375, + "learning_rate": 9.795967741935484e-05, + "loss": 0.1828, + "step": 1766 + }, + { + "epoch": 0.028272, + "grad_norm": 0.8515625, + "learning_rate": 9.795806451612904e-05, + "loss": 0.2115, + "step": 1767 + }, + { + "epoch": 0.028288, + "grad_norm": 1.734375, + "learning_rate": 9.795645161290324e-05, + "loss": 0.2114, + "step": 1768 + }, + { + "epoch": 0.028304, + "grad_norm": 0.84375, + "learning_rate": 9.795483870967742e-05, + "loss": 0.1923, + "step": 1769 + }, + { + "epoch": 0.02832, + "grad_norm": 1.1328125, + "learning_rate": 9.795322580645162e-05, + "loss": 0.2084, + "step": 1770 + }, + { + "epoch": 0.028336, + "grad_norm": 1.03125, + "learning_rate": 9.795161290322581e-05, + "loss": 0.2269, + "step": 1771 + }, + { + "epoch": 0.028352, + "grad_norm": 0.89453125, + "learning_rate": 9.795000000000001e-05, + "loss": 0.1934, + "step": 1772 + }, + { + "epoch": 0.028368, + "grad_norm": 0.79296875, + "learning_rate": 9.79483870967742e-05, + "loss": 0.164, + "step": 1773 + }, + { + "epoch": 0.028384, + "grad_norm": 0.8671875, + "learning_rate": 9.79467741935484e-05, + "loss": 0.2096, + "step": 1774 + }, + { + "epoch": 0.0284, + "grad_norm": 0.74609375, + "learning_rate": 9.794516129032258e-05, + "loss": 0.1904, + "step": 1775 + }, + { + "epoch": 0.028416, + "grad_norm": 0.86328125, + "learning_rate": 9.794354838709678e-05, + "loss": 0.1813, + "step": 1776 + }, + { + "epoch": 0.028432, + "grad_norm": 1.015625, + "learning_rate": 9.794193548387097e-05, + "loss": 0.2026, + "step": 1777 + }, + { + "epoch": 0.028448, + "grad_norm": 1.1796875, + "learning_rate": 9.794032258064517e-05, + "loss": 0.2248, + "step": 1778 + }, + { + "epoch": 0.028464, + "grad_norm": 1.0, + "learning_rate": 9.793870967741937e-05, + "loss": 0.1975, + "step": 1779 + }, + { + "epoch": 0.02848, + "grad_norm": 0.93359375, + "learning_rate": 9.793709677419355e-05, + "loss": 0.2108, + "step": 1780 + }, + { + "epoch": 0.028496, + "grad_norm": 0.97265625, + "learning_rate": 9.793548387096775e-05, + "loss": 0.1758, + "step": 1781 + }, + { + "epoch": 0.028512, + "grad_norm": 0.63671875, + "learning_rate": 9.793387096774194e-05, + "loss": 0.1539, + "step": 1782 + }, + { + "epoch": 0.028528, + "grad_norm": 1.1484375, + "learning_rate": 9.793225806451614e-05, + "loss": 0.251, + "step": 1783 + }, + { + "epoch": 0.028544, + "grad_norm": 0.66015625, + "learning_rate": 9.793064516129032e-05, + "loss": 0.1547, + "step": 1784 + }, + { + "epoch": 0.02856, + "grad_norm": 0.69921875, + "learning_rate": 9.792903225806452e-05, + "loss": 0.1604, + "step": 1785 + }, + { + "epoch": 0.028576, + "grad_norm": 0.875, + "learning_rate": 9.792741935483871e-05, + "loss": 0.2064, + "step": 1786 + }, + { + "epoch": 0.028592, + "grad_norm": 0.890625, + "learning_rate": 9.792580645161291e-05, + "loss": 0.192, + "step": 1787 + }, + { + "epoch": 0.028608, + "grad_norm": 1.7109375, + "learning_rate": 9.79241935483871e-05, + "loss": 0.176, + "step": 1788 + }, + { + "epoch": 0.028624, + "grad_norm": 1.1484375, + "learning_rate": 9.79225806451613e-05, + "loss": 0.2129, + "step": 1789 + }, + { + "epoch": 0.02864, + "grad_norm": 1.078125, + "learning_rate": 9.79209677419355e-05, + "loss": 0.2106, + "step": 1790 + }, + { + "epoch": 0.028656, + "grad_norm": 0.875, + "learning_rate": 9.79193548387097e-05, + "loss": 0.1872, + "step": 1791 + }, + { + "epoch": 0.028672, + "grad_norm": 1.0703125, + "learning_rate": 9.791774193548388e-05, + "loss": 0.191, + "step": 1792 + }, + { + "epoch": 0.028688, + "grad_norm": 0.85546875, + "learning_rate": 9.791612903225807e-05, + "loss": 0.1752, + "step": 1793 + }, + { + "epoch": 0.028704, + "grad_norm": 0.78125, + "learning_rate": 9.791451612903227e-05, + "loss": 0.1573, + "step": 1794 + }, + { + "epoch": 0.02872, + "grad_norm": 1.7734375, + "learning_rate": 9.791290322580645e-05, + "loss": 0.2313, + "step": 1795 + }, + { + "epoch": 0.028736, + "grad_norm": 1.3671875, + "learning_rate": 9.791129032258065e-05, + "loss": 0.2015, + "step": 1796 + }, + { + "epoch": 0.028752, + "grad_norm": 1.09375, + "learning_rate": 9.790967741935484e-05, + "loss": 0.1847, + "step": 1797 + }, + { + "epoch": 0.028768, + "grad_norm": 0.546875, + "learning_rate": 9.790806451612904e-05, + "loss": 0.1293, + "step": 1798 + }, + { + "epoch": 0.028784, + "grad_norm": 0.85546875, + "learning_rate": 9.790645161290322e-05, + "loss": 0.1899, + "step": 1799 + }, + { + "epoch": 0.0288, + "grad_norm": 1.125, + "learning_rate": 9.790483870967742e-05, + "loss": 0.1912, + "step": 1800 + }, + { + "epoch": 0.028816, + "grad_norm": 0.828125, + "learning_rate": 9.790322580645161e-05, + "loss": 0.2247, + "step": 1801 + }, + { + "epoch": 0.028832, + "grad_norm": 0.85546875, + "learning_rate": 9.790161290322581e-05, + "loss": 0.2031, + "step": 1802 + }, + { + "epoch": 0.028848, + "grad_norm": 0.98046875, + "learning_rate": 9.790000000000001e-05, + "loss": 0.2128, + "step": 1803 + }, + { + "epoch": 0.028864, + "grad_norm": 0.82421875, + "learning_rate": 9.789838709677421e-05, + "loss": 0.1995, + "step": 1804 + }, + { + "epoch": 0.02888, + "grad_norm": 1.2109375, + "learning_rate": 9.78967741935484e-05, + "loss": 0.2204, + "step": 1805 + }, + { + "epoch": 0.028896, + "grad_norm": 0.8046875, + "learning_rate": 9.78951612903226e-05, + "loss": 0.2098, + "step": 1806 + }, + { + "epoch": 0.028912, + "grad_norm": 1.4609375, + "learning_rate": 9.789354838709678e-05, + "loss": 0.2001, + "step": 1807 + }, + { + "epoch": 0.028928, + "grad_norm": 0.87109375, + "learning_rate": 9.789193548387098e-05, + "loss": 0.218, + "step": 1808 + }, + { + "epoch": 0.028944, + "grad_norm": 1.15625, + "learning_rate": 9.789032258064517e-05, + "loss": 0.1852, + "step": 1809 + }, + { + "epoch": 0.02896, + "grad_norm": 1.3203125, + "learning_rate": 9.788870967741935e-05, + "loss": 0.1926, + "step": 1810 + }, + { + "epoch": 0.028976, + "grad_norm": 1.515625, + "learning_rate": 9.788709677419355e-05, + "loss": 0.1722, + "step": 1811 + }, + { + "epoch": 0.028992, + "grad_norm": 1.4765625, + "learning_rate": 9.788548387096774e-05, + "loss": 0.2001, + "step": 1812 + }, + { + "epoch": 0.029008, + "grad_norm": 0.93359375, + "learning_rate": 9.788387096774194e-05, + "loss": 0.1724, + "step": 1813 + }, + { + "epoch": 0.029024, + "grad_norm": 1.0625, + "learning_rate": 9.788225806451614e-05, + "loss": 0.19, + "step": 1814 + }, + { + "epoch": 0.02904, + "grad_norm": 1.4140625, + "learning_rate": 9.788064516129034e-05, + "loss": 0.2299, + "step": 1815 + }, + { + "epoch": 0.029056, + "grad_norm": 1.609375, + "learning_rate": 9.787903225806452e-05, + "loss": 0.1502, + "step": 1816 + }, + { + "epoch": 0.029072, + "grad_norm": 1.1640625, + "learning_rate": 9.787741935483872e-05, + "loss": 0.237, + "step": 1817 + }, + { + "epoch": 0.029088, + "grad_norm": 1.8828125, + "learning_rate": 9.787580645161291e-05, + "loss": 0.2401, + "step": 1818 + }, + { + "epoch": 0.029104, + "grad_norm": 1.2734375, + "learning_rate": 9.787419354838711e-05, + "loss": 0.1919, + "step": 1819 + }, + { + "epoch": 0.02912, + "grad_norm": 0.84375, + "learning_rate": 9.78725806451613e-05, + "loss": 0.2194, + "step": 1820 + }, + { + "epoch": 0.029136, + "grad_norm": 0.8125, + "learning_rate": 9.787096774193549e-05, + "loss": 0.2104, + "step": 1821 + }, + { + "epoch": 0.029152, + "grad_norm": 1.0546875, + "learning_rate": 9.786935483870968e-05, + "loss": 0.1965, + "step": 1822 + }, + { + "epoch": 0.029168, + "grad_norm": 1.6796875, + "learning_rate": 9.786774193548388e-05, + "loss": 0.2141, + "step": 1823 + }, + { + "epoch": 0.029184, + "grad_norm": 0.92578125, + "learning_rate": 9.786612903225806e-05, + "loss": 0.1884, + "step": 1824 + }, + { + "epoch": 0.0292, + "grad_norm": 1.359375, + "learning_rate": 9.786451612903226e-05, + "loss": 0.2308, + "step": 1825 + }, + { + "epoch": 0.029216, + "grad_norm": 0.80859375, + "learning_rate": 9.786290322580645e-05, + "loss": 0.2439, + "step": 1826 + }, + { + "epoch": 0.029232, + "grad_norm": 1.375, + "learning_rate": 9.786129032258065e-05, + "loss": 0.2275, + "step": 1827 + }, + { + "epoch": 0.029248, + "grad_norm": 0.7578125, + "learning_rate": 9.785967741935485e-05, + "loss": 0.1776, + "step": 1828 + }, + { + "epoch": 0.029264, + "grad_norm": 0.8203125, + "learning_rate": 9.785806451612904e-05, + "loss": 0.2132, + "step": 1829 + }, + { + "epoch": 0.02928, + "grad_norm": 1.1640625, + "learning_rate": 9.785645161290324e-05, + "loss": 0.1965, + "step": 1830 + }, + { + "epoch": 0.029296, + "grad_norm": 0.921875, + "learning_rate": 9.785483870967742e-05, + "loss": 0.1916, + "step": 1831 + }, + { + "epoch": 0.029312, + "grad_norm": 0.92578125, + "learning_rate": 9.785322580645162e-05, + "loss": 0.2114, + "step": 1832 + }, + { + "epoch": 0.029328, + "grad_norm": 1.2265625, + "learning_rate": 9.785161290322581e-05, + "loss": 0.1829, + "step": 1833 + }, + { + "epoch": 0.029344, + "grad_norm": 0.91796875, + "learning_rate": 9.785e-05, + "loss": 0.2123, + "step": 1834 + }, + { + "epoch": 0.02936, + "grad_norm": 0.9765625, + "learning_rate": 9.784838709677419e-05, + "loss": 0.1967, + "step": 1835 + }, + { + "epoch": 0.029376, + "grad_norm": 0.9609375, + "learning_rate": 9.784677419354839e-05, + "loss": 0.2058, + "step": 1836 + }, + { + "epoch": 0.029392, + "grad_norm": 0.7265625, + "learning_rate": 9.784516129032258e-05, + "loss": 0.1717, + "step": 1837 + }, + { + "epoch": 0.029408, + "grad_norm": 0.703125, + "learning_rate": 9.784354838709678e-05, + "loss": 0.1544, + "step": 1838 + }, + { + "epoch": 0.029424, + "grad_norm": 0.8203125, + "learning_rate": 9.784193548387098e-05, + "loss": 0.1969, + "step": 1839 + }, + { + "epoch": 0.02944, + "grad_norm": 0.98828125, + "learning_rate": 9.784032258064516e-05, + "loss": 0.1625, + "step": 1840 + }, + { + "epoch": 0.029456, + "grad_norm": 0.7109375, + "learning_rate": 9.783870967741936e-05, + "loss": 0.1776, + "step": 1841 + }, + { + "epoch": 0.029472, + "grad_norm": 1.046875, + "learning_rate": 9.783709677419355e-05, + "loss": 0.1728, + "step": 1842 + }, + { + "epoch": 0.029488, + "grad_norm": 0.8515625, + "learning_rate": 9.783548387096775e-05, + "loss": 0.2484, + "step": 1843 + }, + { + "epoch": 0.029504, + "grad_norm": 1.09375, + "learning_rate": 9.783387096774194e-05, + "loss": 0.2005, + "step": 1844 + }, + { + "epoch": 0.02952, + "grad_norm": 1.0703125, + "learning_rate": 9.783225806451613e-05, + "loss": 0.1811, + "step": 1845 + }, + { + "epoch": 0.029536, + "grad_norm": 1.6015625, + "learning_rate": 9.783064516129032e-05, + "loss": 0.2276, + "step": 1846 + }, + { + "epoch": 0.029552, + "grad_norm": 1.015625, + "learning_rate": 9.782903225806452e-05, + "loss": 0.1961, + "step": 1847 + }, + { + "epoch": 0.029568, + "grad_norm": 1.0234375, + "learning_rate": 9.78274193548387e-05, + "loss": 0.1933, + "step": 1848 + }, + { + "epoch": 0.029584, + "grad_norm": 0.921875, + "learning_rate": 9.78258064516129e-05, + "loss": 0.2368, + "step": 1849 + }, + { + "epoch": 0.0296, + "grad_norm": 0.98046875, + "learning_rate": 9.78241935483871e-05, + "loss": 0.1928, + "step": 1850 + }, + { + "epoch": 0.029616, + "grad_norm": 1.015625, + "learning_rate": 9.78225806451613e-05, + "loss": 0.195, + "step": 1851 + }, + { + "epoch": 0.029632, + "grad_norm": 0.828125, + "learning_rate": 9.782096774193549e-05, + "loss": 0.2099, + "step": 1852 + }, + { + "epoch": 0.029648, + "grad_norm": 0.94140625, + "learning_rate": 9.781935483870969e-05, + "loss": 0.1937, + "step": 1853 + }, + { + "epoch": 0.029664, + "grad_norm": 0.85546875, + "learning_rate": 9.781774193548388e-05, + "loss": 0.2397, + "step": 1854 + }, + { + "epoch": 0.02968, + "grad_norm": 0.87890625, + "learning_rate": 9.781612903225806e-05, + "loss": 0.1924, + "step": 1855 + }, + { + "epoch": 0.029696, + "grad_norm": 0.8125, + "learning_rate": 9.781451612903226e-05, + "loss": 0.2086, + "step": 1856 + }, + { + "epoch": 0.029712, + "grad_norm": 1.125, + "learning_rate": 9.781290322580645e-05, + "loss": 0.2051, + "step": 1857 + }, + { + "epoch": 0.029728, + "grad_norm": 0.89453125, + "learning_rate": 9.781129032258065e-05, + "loss": 0.1971, + "step": 1858 + }, + { + "epoch": 0.029744, + "grad_norm": 0.8671875, + "learning_rate": 9.780967741935483e-05, + "loss": 0.1868, + "step": 1859 + }, + { + "epoch": 0.02976, + "grad_norm": 1.15625, + "learning_rate": 9.780806451612903e-05, + "loss": 0.1709, + "step": 1860 + }, + { + "epoch": 0.029776, + "grad_norm": 1.2265625, + "learning_rate": 9.780645161290322e-05, + "loss": 0.229, + "step": 1861 + }, + { + "epoch": 0.029792, + "grad_norm": 0.859375, + "learning_rate": 9.780483870967742e-05, + "loss": 0.1855, + "step": 1862 + }, + { + "epoch": 0.029808, + "grad_norm": 0.9921875, + "learning_rate": 9.780322580645162e-05, + "loss": 0.1571, + "step": 1863 + }, + { + "epoch": 0.029824, + "grad_norm": 1.03125, + "learning_rate": 9.780161290322582e-05, + "loss": 0.1968, + "step": 1864 + }, + { + "epoch": 0.02984, + "grad_norm": 1.2890625, + "learning_rate": 9.78e-05, + "loss": 0.1775, + "step": 1865 + }, + { + "epoch": 0.029856, + "grad_norm": 0.84765625, + "learning_rate": 9.77983870967742e-05, + "loss": 0.2487, + "step": 1866 + }, + { + "epoch": 0.029872, + "grad_norm": 1.0625, + "learning_rate": 9.779677419354839e-05, + "loss": 0.1931, + "step": 1867 + }, + { + "epoch": 0.029888, + "grad_norm": 0.7734375, + "learning_rate": 9.779516129032259e-05, + "loss": 0.1803, + "step": 1868 + }, + { + "epoch": 0.029904, + "grad_norm": 0.671875, + "learning_rate": 9.779354838709678e-05, + "loss": 0.1581, + "step": 1869 + }, + { + "epoch": 0.02992, + "grad_norm": 0.76171875, + "learning_rate": 9.779193548387098e-05, + "loss": 0.168, + "step": 1870 + }, + { + "epoch": 0.029936, + "grad_norm": 1.140625, + "learning_rate": 9.779032258064516e-05, + "loss": 0.2035, + "step": 1871 + }, + { + "epoch": 0.029952, + "grad_norm": 1.1796875, + "learning_rate": 9.778870967741935e-05, + "loss": 0.1953, + "step": 1872 + }, + { + "epoch": 0.029968, + "grad_norm": 1.171875, + "learning_rate": 9.778709677419355e-05, + "loss": 0.1685, + "step": 1873 + }, + { + "epoch": 0.029984, + "grad_norm": 0.98828125, + "learning_rate": 9.778548387096775e-05, + "loss": 0.2401, + "step": 1874 + }, + { + "epoch": 0.03, + "grad_norm": 1.3203125, + "learning_rate": 9.778387096774195e-05, + "loss": 0.216, + "step": 1875 + }, + { + "epoch": 0.030016, + "grad_norm": 1.1953125, + "learning_rate": 9.778225806451613e-05, + "loss": 0.2555, + "step": 1876 + }, + { + "epoch": 0.030032, + "grad_norm": 0.98046875, + "learning_rate": 9.778064516129033e-05, + "loss": 0.2078, + "step": 1877 + }, + { + "epoch": 0.030048, + "grad_norm": 0.9296875, + "learning_rate": 9.777903225806452e-05, + "loss": 0.2042, + "step": 1878 + }, + { + "epoch": 0.030064, + "grad_norm": 1.3203125, + "learning_rate": 9.777741935483872e-05, + "loss": 0.1949, + "step": 1879 + }, + { + "epoch": 0.03008, + "grad_norm": 0.89453125, + "learning_rate": 9.77758064516129e-05, + "loss": 0.1803, + "step": 1880 + }, + { + "epoch": 0.030096, + "grad_norm": 0.82421875, + "learning_rate": 9.77741935483871e-05, + "loss": 0.1911, + "step": 1881 + }, + { + "epoch": 0.030112, + "grad_norm": 0.80078125, + "learning_rate": 9.777258064516129e-05, + "loss": 0.1609, + "step": 1882 + }, + { + "epoch": 0.030128, + "grad_norm": 1.1796875, + "learning_rate": 9.777096774193549e-05, + "loss": 0.1881, + "step": 1883 + }, + { + "epoch": 0.030144, + "grad_norm": 0.89453125, + "learning_rate": 9.776935483870968e-05, + "loss": 0.1991, + "step": 1884 + }, + { + "epoch": 0.03016, + "grad_norm": 0.90234375, + "learning_rate": 9.776774193548388e-05, + "loss": 0.2141, + "step": 1885 + }, + { + "epoch": 0.030176, + "grad_norm": 1.0859375, + "learning_rate": 9.776612903225808e-05, + "loss": 0.177, + "step": 1886 + }, + { + "epoch": 0.030192, + "grad_norm": 0.890625, + "learning_rate": 9.776451612903226e-05, + "loss": 0.1866, + "step": 1887 + }, + { + "epoch": 0.030208, + "grad_norm": 0.875, + "learning_rate": 9.776290322580646e-05, + "loss": 0.1801, + "step": 1888 + }, + { + "epoch": 0.030224, + "grad_norm": 0.87890625, + "learning_rate": 9.776129032258065e-05, + "loss": 0.1893, + "step": 1889 + }, + { + "epoch": 0.03024, + "grad_norm": 0.87890625, + "learning_rate": 9.775967741935485e-05, + "loss": 0.1829, + "step": 1890 + }, + { + "epoch": 0.030256, + "grad_norm": 0.76171875, + "learning_rate": 9.775806451612903e-05, + "loss": 0.2079, + "step": 1891 + }, + { + "epoch": 0.030272, + "grad_norm": 0.83984375, + "learning_rate": 9.775645161290323e-05, + "loss": 0.2018, + "step": 1892 + }, + { + "epoch": 0.030288, + "grad_norm": 0.921875, + "learning_rate": 9.775483870967742e-05, + "loss": 0.2375, + "step": 1893 + }, + { + "epoch": 0.030304, + "grad_norm": 0.91015625, + "learning_rate": 9.775322580645162e-05, + "loss": 0.2, + "step": 1894 + }, + { + "epoch": 0.03032, + "grad_norm": 0.75, + "learning_rate": 9.77516129032258e-05, + "loss": 0.1587, + "step": 1895 + }, + { + "epoch": 0.030336, + "grad_norm": 1.0078125, + "learning_rate": 9.775e-05, + "loss": 0.214, + "step": 1896 + }, + { + "epoch": 0.030352, + "grad_norm": 0.85546875, + "learning_rate": 9.774838709677419e-05, + "loss": 0.2185, + "step": 1897 + }, + { + "epoch": 0.030368, + "grad_norm": 0.99609375, + "learning_rate": 9.774677419354839e-05, + "loss": 0.1759, + "step": 1898 + }, + { + "epoch": 0.030384, + "grad_norm": 0.8359375, + "learning_rate": 9.774516129032259e-05, + "loss": 0.2089, + "step": 1899 + }, + { + "epoch": 0.0304, + "grad_norm": 1.6015625, + "learning_rate": 9.774354838709679e-05, + "loss": 0.1674, + "step": 1900 + }, + { + "epoch": 0.030416, + "grad_norm": 0.97265625, + "learning_rate": 9.774193548387098e-05, + "loss": 0.2352, + "step": 1901 + }, + { + "epoch": 0.030432, + "grad_norm": 0.58203125, + "learning_rate": 9.774032258064516e-05, + "loss": 0.1454, + "step": 1902 + }, + { + "epoch": 0.030448, + "grad_norm": 0.8203125, + "learning_rate": 9.773870967741936e-05, + "loss": 0.1559, + "step": 1903 + }, + { + "epoch": 0.030464, + "grad_norm": 0.734375, + "learning_rate": 9.773709677419355e-05, + "loss": 0.1634, + "step": 1904 + }, + { + "epoch": 0.03048, + "grad_norm": 0.92578125, + "learning_rate": 9.773548387096775e-05, + "loss": 0.2115, + "step": 1905 + }, + { + "epoch": 0.030496, + "grad_norm": 0.73046875, + "learning_rate": 9.773387096774193e-05, + "loss": 0.1558, + "step": 1906 + }, + { + "epoch": 0.030512, + "grad_norm": 0.75, + "learning_rate": 9.773225806451613e-05, + "loss": 0.1887, + "step": 1907 + }, + { + "epoch": 0.030528, + "grad_norm": 0.65234375, + "learning_rate": 9.773064516129032e-05, + "loss": 0.1702, + "step": 1908 + }, + { + "epoch": 0.030544, + "grad_norm": 1.4296875, + "learning_rate": 9.772903225806452e-05, + "loss": 0.2358, + "step": 1909 + }, + { + "epoch": 0.03056, + "grad_norm": 0.890625, + "learning_rate": 9.772741935483872e-05, + "loss": 0.1717, + "step": 1910 + }, + { + "epoch": 0.030576, + "grad_norm": 0.796875, + "learning_rate": 9.772580645161292e-05, + "loss": 0.1932, + "step": 1911 + }, + { + "epoch": 0.030592, + "grad_norm": 0.875, + "learning_rate": 9.77241935483871e-05, + "loss": 0.2144, + "step": 1912 + }, + { + "epoch": 0.030608, + "grad_norm": 1.0546875, + "learning_rate": 9.77225806451613e-05, + "loss": 0.2, + "step": 1913 + }, + { + "epoch": 0.030624, + "grad_norm": 1.2578125, + "learning_rate": 9.772096774193549e-05, + "loss": 0.2143, + "step": 1914 + }, + { + "epoch": 0.03064, + "grad_norm": 0.99609375, + "learning_rate": 9.771935483870969e-05, + "loss": 0.2166, + "step": 1915 + }, + { + "epoch": 0.030656, + "grad_norm": 1.59375, + "learning_rate": 9.771774193548387e-05, + "loss": 0.2578, + "step": 1916 + }, + { + "epoch": 0.030672, + "grad_norm": 1.2265625, + "learning_rate": 9.771612903225807e-05, + "loss": 0.2259, + "step": 1917 + }, + { + "epoch": 0.030688, + "grad_norm": 1.1015625, + "learning_rate": 9.771451612903226e-05, + "loss": 0.1907, + "step": 1918 + }, + { + "epoch": 0.030704, + "grad_norm": 0.96484375, + "learning_rate": 9.771290322580645e-05, + "loss": 0.2461, + "step": 1919 + }, + { + "epoch": 0.03072, + "grad_norm": 1.171875, + "learning_rate": 9.771129032258065e-05, + "loss": 0.2036, + "step": 1920 + }, + { + "epoch": 0.030736, + "grad_norm": 0.55859375, + "learning_rate": 9.770967741935485e-05, + "loss": 0.1889, + "step": 1921 + }, + { + "epoch": 0.030752, + "grad_norm": 1.0, + "learning_rate": 9.770806451612903e-05, + "loss": 0.2045, + "step": 1922 + }, + { + "epoch": 0.030768, + "grad_norm": 0.86328125, + "learning_rate": 9.770645161290323e-05, + "loss": 0.1916, + "step": 1923 + }, + { + "epoch": 0.030784, + "grad_norm": 0.94921875, + "learning_rate": 9.770483870967743e-05, + "loss": 0.2101, + "step": 1924 + }, + { + "epoch": 0.0308, + "grad_norm": 0.7890625, + "learning_rate": 9.770322580645162e-05, + "loss": 0.1791, + "step": 1925 + }, + { + "epoch": 0.030816, + "grad_norm": 0.98828125, + "learning_rate": 9.770161290322582e-05, + "loss": 0.2178, + "step": 1926 + }, + { + "epoch": 0.030832, + "grad_norm": 0.921875, + "learning_rate": 9.77e-05, + "loss": 0.1843, + "step": 1927 + }, + { + "epoch": 0.030848, + "grad_norm": 0.86328125, + "learning_rate": 9.76983870967742e-05, + "loss": 0.19, + "step": 1928 + }, + { + "epoch": 0.030864, + "grad_norm": 0.87890625, + "learning_rate": 9.769677419354839e-05, + "loss": 0.1871, + "step": 1929 + }, + { + "epoch": 0.03088, + "grad_norm": 0.97265625, + "learning_rate": 9.769516129032259e-05, + "loss": 0.1834, + "step": 1930 + }, + { + "epoch": 0.030896, + "grad_norm": 1.25, + "learning_rate": 9.769354838709677e-05, + "loss": 0.2097, + "step": 1931 + }, + { + "epoch": 0.030912, + "grad_norm": 1.5, + "learning_rate": 9.769193548387097e-05, + "loss": 0.1889, + "step": 1932 + }, + { + "epoch": 0.030928, + "grad_norm": 0.77734375, + "learning_rate": 9.769032258064516e-05, + "loss": 0.2051, + "step": 1933 + }, + { + "epoch": 0.030944, + "grad_norm": 0.984375, + "learning_rate": 9.768870967741936e-05, + "loss": 0.1385, + "step": 1934 + }, + { + "epoch": 0.03096, + "grad_norm": 1.375, + "learning_rate": 9.768709677419356e-05, + "loss": 0.2259, + "step": 1935 + }, + { + "epoch": 0.030976, + "grad_norm": 1.09375, + "learning_rate": 9.768548387096775e-05, + "loss": 0.1777, + "step": 1936 + }, + { + "epoch": 0.030992, + "grad_norm": 1.078125, + "learning_rate": 9.768387096774195e-05, + "loss": 0.2092, + "step": 1937 + }, + { + "epoch": 0.031008, + "grad_norm": 0.92578125, + "learning_rate": 9.768225806451613e-05, + "loss": 0.1559, + "step": 1938 + }, + { + "epoch": 0.031024, + "grad_norm": 0.90234375, + "learning_rate": 9.768064516129033e-05, + "loss": 0.212, + "step": 1939 + }, + { + "epoch": 0.03104, + "grad_norm": 1.109375, + "learning_rate": 9.767903225806452e-05, + "loss": 0.2009, + "step": 1940 + }, + { + "epoch": 0.031056, + "grad_norm": 0.73046875, + "learning_rate": 9.767741935483872e-05, + "loss": 0.1727, + "step": 1941 + }, + { + "epoch": 0.031072, + "grad_norm": 0.953125, + "learning_rate": 9.76758064516129e-05, + "loss": 0.2189, + "step": 1942 + }, + { + "epoch": 0.031088, + "grad_norm": 1.2265625, + "learning_rate": 9.76741935483871e-05, + "loss": 0.2364, + "step": 1943 + }, + { + "epoch": 0.031104, + "grad_norm": 0.765625, + "learning_rate": 9.767258064516129e-05, + "loss": 0.1594, + "step": 1944 + }, + { + "epoch": 0.03112, + "grad_norm": 0.91796875, + "learning_rate": 9.767096774193549e-05, + "loss": 0.2169, + "step": 1945 + }, + { + "epoch": 0.031136, + "grad_norm": 0.828125, + "learning_rate": 9.766935483870969e-05, + "loss": 0.1528, + "step": 1946 + }, + { + "epoch": 0.031152, + "grad_norm": 1.046875, + "learning_rate": 9.766774193548389e-05, + "loss": 0.1843, + "step": 1947 + }, + { + "epoch": 0.031168, + "grad_norm": 0.94921875, + "learning_rate": 9.766612903225807e-05, + "loss": 0.2302, + "step": 1948 + }, + { + "epoch": 0.031184, + "grad_norm": 1.1015625, + "learning_rate": 9.766451612903226e-05, + "loss": 0.2144, + "step": 1949 + }, + { + "epoch": 0.0312, + "grad_norm": 0.8359375, + "learning_rate": 9.766290322580646e-05, + "loss": 0.1927, + "step": 1950 + }, + { + "epoch": 0.031216, + "grad_norm": 0.83203125, + "learning_rate": 9.766129032258065e-05, + "loss": 0.1989, + "step": 1951 + }, + { + "epoch": 0.031232, + "grad_norm": 0.66796875, + "learning_rate": 9.765967741935484e-05, + "loss": 0.1613, + "step": 1952 + }, + { + "epoch": 0.031248, + "grad_norm": 0.95703125, + "learning_rate": 9.765806451612903e-05, + "loss": 0.1935, + "step": 1953 + }, + { + "epoch": 0.031264, + "grad_norm": 0.9765625, + "learning_rate": 9.765645161290323e-05, + "loss": 0.2023, + "step": 1954 + }, + { + "epoch": 0.03128, + "grad_norm": 1.2734375, + "learning_rate": 9.765483870967742e-05, + "loss": 0.2453, + "step": 1955 + }, + { + "epoch": 0.031296, + "grad_norm": 0.85546875, + "learning_rate": 9.765322580645162e-05, + "loss": 0.1856, + "step": 1956 + }, + { + "epoch": 0.031312, + "grad_norm": 1.015625, + "learning_rate": 9.76516129032258e-05, + "loss": 0.2178, + "step": 1957 + }, + { + "epoch": 0.031328, + "grad_norm": 0.77734375, + "learning_rate": 9.765e-05, + "loss": 0.2026, + "step": 1958 + }, + { + "epoch": 0.031344, + "grad_norm": 1.875, + "learning_rate": 9.76483870967742e-05, + "loss": 0.2333, + "step": 1959 + }, + { + "epoch": 0.03136, + "grad_norm": 1.1171875, + "learning_rate": 9.76467741935484e-05, + "loss": 0.1828, + "step": 1960 + }, + { + "epoch": 0.031376, + "grad_norm": 1.1484375, + "learning_rate": 9.764516129032259e-05, + "loss": 0.2002, + "step": 1961 + }, + { + "epoch": 0.031392, + "grad_norm": 1.015625, + "learning_rate": 9.764354838709679e-05, + "loss": 0.2265, + "step": 1962 + }, + { + "epoch": 0.031408, + "grad_norm": 1.1796875, + "learning_rate": 9.764193548387097e-05, + "loss": 0.2125, + "step": 1963 + }, + { + "epoch": 0.031424, + "grad_norm": 1.0390625, + "learning_rate": 9.764032258064516e-05, + "loss": 0.2489, + "step": 1964 + }, + { + "epoch": 0.03144, + "grad_norm": 0.75, + "learning_rate": 9.763870967741936e-05, + "loss": 0.1656, + "step": 1965 + }, + { + "epoch": 0.031456, + "grad_norm": 0.72265625, + "learning_rate": 9.763709677419354e-05, + "loss": 0.1529, + "step": 1966 + }, + { + "epoch": 0.031472, + "grad_norm": 0.7734375, + "learning_rate": 9.763548387096774e-05, + "loss": 0.2022, + "step": 1967 + }, + { + "epoch": 0.031488, + "grad_norm": 0.9375, + "learning_rate": 9.763387096774193e-05, + "loss": 0.2152, + "step": 1968 + }, + { + "epoch": 0.031504, + "grad_norm": 1.1875, + "learning_rate": 9.763225806451613e-05, + "loss": 0.2129, + "step": 1969 + }, + { + "epoch": 0.03152, + "grad_norm": 0.83984375, + "learning_rate": 9.763064516129033e-05, + "loss": 0.1971, + "step": 1970 + }, + { + "epoch": 0.031536, + "grad_norm": 0.890625, + "learning_rate": 9.762903225806453e-05, + "loss": 0.2058, + "step": 1971 + }, + { + "epoch": 0.031552, + "grad_norm": 1.5546875, + "learning_rate": 9.762741935483872e-05, + "loss": 0.2166, + "step": 1972 + }, + { + "epoch": 0.031568, + "grad_norm": 1.4609375, + "learning_rate": 9.762580645161292e-05, + "loss": 0.2445, + "step": 1973 + }, + { + "epoch": 0.031584, + "grad_norm": 0.9921875, + "learning_rate": 9.76241935483871e-05, + "loss": 0.164, + "step": 1974 + }, + { + "epoch": 0.0316, + "grad_norm": 0.9921875, + "learning_rate": 9.76225806451613e-05, + "loss": 0.22, + "step": 1975 + }, + { + "epoch": 0.031616, + "grad_norm": 0.83203125, + "learning_rate": 9.762096774193549e-05, + "loss": 0.2193, + "step": 1976 + }, + { + "epoch": 0.031632, + "grad_norm": 0.69921875, + "learning_rate": 9.761935483870969e-05, + "loss": 0.1975, + "step": 1977 + }, + { + "epoch": 0.031648, + "grad_norm": 0.7265625, + "learning_rate": 9.761774193548387e-05, + "loss": 0.1879, + "step": 1978 + }, + { + "epoch": 0.031664, + "grad_norm": 1.078125, + "learning_rate": 9.761612903225807e-05, + "loss": 0.1692, + "step": 1979 + }, + { + "epoch": 0.03168, + "grad_norm": 0.796875, + "learning_rate": 9.761451612903226e-05, + "loss": 0.1629, + "step": 1980 + }, + { + "epoch": 0.031696, + "grad_norm": 0.94921875, + "learning_rate": 9.761290322580646e-05, + "loss": 0.226, + "step": 1981 + }, + { + "epoch": 0.031712, + "grad_norm": 1.8828125, + "learning_rate": 9.761129032258066e-05, + "loss": 0.2011, + "step": 1982 + }, + { + "epoch": 0.031728, + "grad_norm": 0.671875, + "learning_rate": 9.760967741935484e-05, + "loss": 0.1988, + "step": 1983 + }, + { + "epoch": 0.031744, + "grad_norm": 0.87109375, + "learning_rate": 9.760806451612904e-05, + "loss": 0.2163, + "step": 1984 + }, + { + "epoch": 0.03176, + "grad_norm": 1.0, + "learning_rate": 9.760645161290323e-05, + "loss": 0.1845, + "step": 1985 + }, + { + "epoch": 0.031776, + "grad_norm": 0.93359375, + "learning_rate": 9.760483870967743e-05, + "loss": 0.1643, + "step": 1986 + }, + { + "epoch": 0.031792, + "grad_norm": 1.03125, + "learning_rate": 9.760322580645161e-05, + "loss": 0.2323, + "step": 1987 + }, + { + "epoch": 0.031808, + "grad_norm": 0.6484375, + "learning_rate": 9.760161290322581e-05, + "loss": 0.1602, + "step": 1988 + }, + { + "epoch": 0.031824, + "grad_norm": 0.8046875, + "learning_rate": 9.76e-05, + "loss": 0.1981, + "step": 1989 + }, + { + "epoch": 0.03184, + "grad_norm": 0.69921875, + "learning_rate": 9.75983870967742e-05, + "loss": 0.199, + "step": 1990 + }, + { + "epoch": 0.031856, + "grad_norm": 1.09375, + "learning_rate": 9.759677419354839e-05, + "loss": 0.154, + "step": 1991 + }, + { + "epoch": 0.031872, + "grad_norm": 0.9375, + "learning_rate": 9.759516129032259e-05, + "loss": 0.1965, + "step": 1992 + }, + { + "epoch": 0.031888, + "grad_norm": 1.234375, + "learning_rate": 9.759354838709677e-05, + "loss": 0.1941, + "step": 1993 + }, + { + "epoch": 0.031904, + "grad_norm": 1.2578125, + "learning_rate": 9.759193548387097e-05, + "loss": 0.2081, + "step": 1994 + }, + { + "epoch": 0.03192, + "grad_norm": 0.81640625, + "learning_rate": 9.759032258064517e-05, + "loss": 0.1667, + "step": 1995 + }, + { + "epoch": 0.031936, + "grad_norm": 0.85546875, + "learning_rate": 9.758870967741936e-05, + "loss": 0.1762, + "step": 1996 + }, + { + "epoch": 0.031952, + "grad_norm": 1.390625, + "learning_rate": 9.758709677419356e-05, + "loss": 0.1983, + "step": 1997 + }, + { + "epoch": 0.031968, + "grad_norm": 0.8984375, + "learning_rate": 9.758548387096774e-05, + "loss": 0.2065, + "step": 1998 + }, + { + "epoch": 0.031984, + "grad_norm": 1.0546875, + "learning_rate": 9.758387096774194e-05, + "loss": 0.1833, + "step": 1999 + }, + { + "epoch": 0.032, + "grad_norm": 1.0703125, + "learning_rate": 9.758225806451613e-05, + "loss": 0.205, + "step": 2000 + }, + { + "epoch": 0.032016, + "grad_norm": 0.96484375, + "learning_rate": 9.758064516129033e-05, + "loss": 0.2215, + "step": 2001 + }, + { + "epoch": 0.032032, + "grad_norm": 1.015625, + "learning_rate": 9.757903225806451e-05, + "loss": 0.1842, + "step": 2002 + }, + { + "epoch": 0.032048, + "grad_norm": 1.015625, + "learning_rate": 9.757741935483871e-05, + "loss": 0.1635, + "step": 2003 + }, + { + "epoch": 0.032064, + "grad_norm": 0.94140625, + "learning_rate": 9.75758064516129e-05, + "loss": 0.1536, + "step": 2004 + }, + { + "epoch": 0.03208, + "grad_norm": 1.0390625, + "learning_rate": 9.75741935483871e-05, + "loss": 0.1957, + "step": 2005 + }, + { + "epoch": 0.032096, + "grad_norm": 0.86328125, + "learning_rate": 9.75725806451613e-05, + "loss": 0.2031, + "step": 2006 + }, + { + "epoch": 0.032112, + "grad_norm": 0.80859375, + "learning_rate": 9.75709677419355e-05, + "loss": 0.2321, + "step": 2007 + }, + { + "epoch": 0.032128, + "grad_norm": 0.921875, + "learning_rate": 9.756935483870969e-05, + "loss": 0.2223, + "step": 2008 + }, + { + "epoch": 0.032144, + "grad_norm": 1.234375, + "learning_rate": 9.756774193548388e-05, + "loss": 0.1838, + "step": 2009 + }, + { + "epoch": 0.03216, + "grad_norm": 1.203125, + "learning_rate": 9.756612903225807e-05, + "loss": 0.2025, + "step": 2010 + }, + { + "epoch": 0.032176, + "grad_norm": 1.2421875, + "learning_rate": 9.756451612903226e-05, + "loss": 0.1928, + "step": 2011 + }, + { + "epoch": 0.032192, + "grad_norm": 0.97265625, + "learning_rate": 9.756290322580646e-05, + "loss": 0.2387, + "step": 2012 + }, + { + "epoch": 0.032208, + "grad_norm": 1.1015625, + "learning_rate": 9.756129032258064e-05, + "loss": 0.2149, + "step": 2013 + }, + { + "epoch": 0.032224, + "grad_norm": 1.015625, + "learning_rate": 9.755967741935484e-05, + "loss": 0.2356, + "step": 2014 + }, + { + "epoch": 0.03224, + "grad_norm": 0.7109375, + "learning_rate": 9.755806451612903e-05, + "loss": 0.2243, + "step": 2015 + }, + { + "epoch": 0.032256, + "grad_norm": 1.1796875, + "learning_rate": 9.755645161290323e-05, + "loss": 0.2539, + "step": 2016 + }, + { + "epoch": 0.032272, + "grad_norm": 1.0078125, + "learning_rate": 9.755483870967741e-05, + "loss": 0.196, + "step": 2017 + }, + { + "epoch": 0.032288, + "grad_norm": 1.1171875, + "learning_rate": 9.755322580645161e-05, + "loss": 0.2242, + "step": 2018 + }, + { + "epoch": 0.032304, + "grad_norm": 1.1171875, + "learning_rate": 9.755161290322581e-05, + "loss": 0.1993, + "step": 2019 + }, + { + "epoch": 0.03232, + "grad_norm": 0.68359375, + "learning_rate": 9.755000000000001e-05, + "loss": 0.1904, + "step": 2020 + }, + { + "epoch": 0.032336, + "grad_norm": 0.76953125, + "learning_rate": 9.75483870967742e-05, + "loss": 0.2103, + "step": 2021 + }, + { + "epoch": 0.032352, + "grad_norm": 0.875, + "learning_rate": 9.75467741935484e-05, + "loss": 0.1772, + "step": 2022 + }, + { + "epoch": 0.032368, + "grad_norm": 0.9375, + "learning_rate": 9.754516129032258e-05, + "loss": 0.2246, + "step": 2023 + }, + { + "epoch": 0.032384, + "grad_norm": 0.875, + "learning_rate": 9.754354838709678e-05, + "loss": 0.216, + "step": 2024 + }, + { + "epoch": 0.0324, + "grad_norm": 0.953125, + "learning_rate": 9.754193548387097e-05, + "loss": 0.1756, + "step": 2025 + }, + { + "epoch": 0.032416, + "grad_norm": 0.8203125, + "learning_rate": 9.754032258064517e-05, + "loss": 0.2004, + "step": 2026 + }, + { + "epoch": 0.032432, + "grad_norm": 0.796875, + "learning_rate": 9.753870967741936e-05, + "loss": 0.2363, + "step": 2027 + }, + { + "epoch": 0.032448, + "grad_norm": 0.734375, + "learning_rate": 9.753709677419354e-05, + "loss": 0.1596, + "step": 2028 + }, + { + "epoch": 0.032464, + "grad_norm": 1.09375, + "learning_rate": 9.753548387096774e-05, + "loss": 0.2426, + "step": 2029 + }, + { + "epoch": 0.03248, + "grad_norm": 0.93359375, + "learning_rate": 9.753387096774194e-05, + "loss": 0.2078, + "step": 2030 + }, + { + "epoch": 0.032496, + "grad_norm": 0.94921875, + "learning_rate": 9.753225806451614e-05, + "loss": 0.2201, + "step": 2031 + }, + { + "epoch": 0.032512, + "grad_norm": 0.96484375, + "learning_rate": 9.753064516129033e-05, + "loss": 0.2215, + "step": 2032 + }, + { + "epoch": 0.032528, + "grad_norm": 0.88671875, + "learning_rate": 9.752903225806453e-05, + "loss": 0.1746, + "step": 2033 + }, + { + "epoch": 0.032544, + "grad_norm": 1.140625, + "learning_rate": 9.752741935483871e-05, + "loss": 0.2281, + "step": 2034 + }, + { + "epoch": 0.03256, + "grad_norm": 1.0390625, + "learning_rate": 9.752580645161291e-05, + "loss": 0.1964, + "step": 2035 + }, + { + "epoch": 0.032576, + "grad_norm": 1.4375, + "learning_rate": 9.75241935483871e-05, + "loss": 0.1583, + "step": 2036 + }, + { + "epoch": 0.032592, + "grad_norm": 0.8515625, + "learning_rate": 9.75225806451613e-05, + "loss": 0.1989, + "step": 2037 + }, + { + "epoch": 0.032608, + "grad_norm": 0.72265625, + "learning_rate": 9.752096774193548e-05, + "loss": 0.1679, + "step": 2038 + }, + { + "epoch": 0.032624, + "grad_norm": 0.95703125, + "learning_rate": 9.751935483870968e-05, + "loss": 0.1838, + "step": 2039 + }, + { + "epoch": 0.03264, + "grad_norm": 1.125, + "learning_rate": 9.751774193548387e-05, + "loss": 0.2058, + "step": 2040 + }, + { + "epoch": 0.032656, + "grad_norm": 0.92578125, + "learning_rate": 9.751612903225807e-05, + "loss": 0.1723, + "step": 2041 + }, + { + "epoch": 0.032672, + "grad_norm": 0.5546875, + "learning_rate": 9.751451612903227e-05, + "loss": 0.1606, + "step": 2042 + }, + { + "epoch": 0.032688, + "grad_norm": 1.1640625, + "learning_rate": 9.751290322580646e-05, + "loss": 0.2094, + "step": 2043 + }, + { + "epoch": 0.032704, + "grad_norm": 1.28125, + "learning_rate": 9.751129032258066e-05, + "loss": 0.1674, + "step": 2044 + }, + { + "epoch": 0.03272, + "grad_norm": 0.9765625, + "learning_rate": 9.750967741935484e-05, + "loss": 0.2389, + "step": 2045 + }, + { + "epoch": 0.032736, + "grad_norm": 1.0546875, + "learning_rate": 9.750806451612904e-05, + "loss": 0.2439, + "step": 2046 + }, + { + "epoch": 0.032752, + "grad_norm": 1.71875, + "learning_rate": 9.750645161290323e-05, + "loss": 0.1983, + "step": 2047 + }, + { + "epoch": 0.032768, + "grad_norm": 0.765625, + "learning_rate": 9.750483870967743e-05, + "loss": 0.1848, + "step": 2048 + }, + { + "epoch": 0.032784, + "grad_norm": 1.359375, + "learning_rate": 9.750322580645161e-05, + "loss": 0.1878, + "step": 2049 + }, + { + "epoch": 0.0328, + "grad_norm": 0.86328125, + "learning_rate": 9.750161290322581e-05, + "loss": 0.1556, + "step": 2050 + }, + { + "epoch": 0.032816, + "grad_norm": 0.83984375, + "learning_rate": 9.75e-05, + "loss": 0.1593, + "step": 2051 + }, + { + "epoch": 0.032832, + "grad_norm": 1.0859375, + "learning_rate": 9.74983870967742e-05, + "loss": 0.2277, + "step": 2052 + }, + { + "epoch": 0.032848, + "grad_norm": 1.796875, + "learning_rate": 9.749677419354838e-05, + "loss": 0.1995, + "step": 2053 + }, + { + "epoch": 0.032864, + "grad_norm": 1.09375, + "learning_rate": 9.749516129032258e-05, + "loss": 0.2173, + "step": 2054 + }, + { + "epoch": 0.03288, + "grad_norm": 1.53125, + "learning_rate": 9.749354838709678e-05, + "loss": 0.1855, + "step": 2055 + }, + { + "epoch": 0.032896, + "grad_norm": 1.1640625, + "learning_rate": 9.749193548387098e-05, + "loss": 0.156, + "step": 2056 + }, + { + "epoch": 0.032912, + "grad_norm": 1.359375, + "learning_rate": 9.749032258064517e-05, + "loss": 0.1987, + "step": 2057 + }, + { + "epoch": 0.032928, + "grad_norm": 1.1640625, + "learning_rate": 9.748870967741935e-05, + "loss": 0.1887, + "step": 2058 + }, + { + "epoch": 0.032944, + "grad_norm": 0.80859375, + "learning_rate": 9.748709677419355e-05, + "loss": 0.1588, + "step": 2059 + }, + { + "epoch": 0.03296, + "grad_norm": 1.03125, + "learning_rate": 9.748548387096774e-05, + "loss": 0.2011, + "step": 2060 + }, + { + "epoch": 0.032976, + "grad_norm": 1.140625, + "learning_rate": 9.748387096774194e-05, + "loss": 0.2281, + "step": 2061 + }, + { + "epoch": 0.032992, + "grad_norm": 1.109375, + "learning_rate": 9.748225806451613e-05, + "loss": 0.1937, + "step": 2062 + }, + { + "epoch": 0.033008, + "grad_norm": 1.8203125, + "learning_rate": 9.748064516129033e-05, + "loss": 0.1721, + "step": 2063 + }, + { + "epoch": 0.033024, + "grad_norm": 1.71875, + "learning_rate": 9.747903225806451e-05, + "loss": 0.2199, + "step": 2064 + }, + { + "epoch": 0.03304, + "grad_norm": 1.625, + "learning_rate": 9.747741935483871e-05, + "loss": 0.2021, + "step": 2065 + }, + { + "epoch": 0.033056, + "grad_norm": 1.4140625, + "learning_rate": 9.747580645161291e-05, + "loss": 0.176, + "step": 2066 + }, + { + "epoch": 0.033072, + "grad_norm": 1.1796875, + "learning_rate": 9.747419354838711e-05, + "loss": 0.2232, + "step": 2067 + }, + { + "epoch": 0.033088, + "grad_norm": 1.1796875, + "learning_rate": 9.74725806451613e-05, + "loss": 0.1883, + "step": 2068 + }, + { + "epoch": 0.033104, + "grad_norm": 0.71484375, + "learning_rate": 9.74709677419355e-05, + "loss": 0.1794, + "step": 2069 + }, + { + "epoch": 0.03312, + "grad_norm": 1.546875, + "learning_rate": 9.746935483870968e-05, + "loss": 0.159, + "step": 2070 + }, + { + "epoch": 0.033136, + "grad_norm": 1.4140625, + "learning_rate": 9.746774193548388e-05, + "loss": 0.2019, + "step": 2071 + }, + { + "epoch": 0.033152, + "grad_norm": 1.5546875, + "learning_rate": 9.746612903225807e-05, + "loss": 0.2047, + "step": 2072 + }, + { + "epoch": 0.033168, + "grad_norm": 1.2890625, + "learning_rate": 9.746451612903225e-05, + "loss": 0.1579, + "step": 2073 + }, + { + "epoch": 0.033184, + "grad_norm": 1.265625, + "learning_rate": 9.746290322580645e-05, + "loss": 0.1315, + "step": 2074 + }, + { + "epoch": 0.0332, + "grad_norm": 1.140625, + "learning_rate": 9.746129032258064e-05, + "loss": 0.2163, + "step": 2075 + }, + { + "epoch": 0.033216, + "grad_norm": 0.578125, + "learning_rate": 9.745967741935484e-05, + "loss": 0.145, + "step": 2076 + }, + { + "epoch": 0.033232, + "grad_norm": 0.87109375, + "learning_rate": 9.745806451612904e-05, + "loss": 0.1976, + "step": 2077 + }, + { + "epoch": 0.033248, + "grad_norm": 1.125, + "learning_rate": 9.745645161290323e-05, + "loss": 0.2074, + "step": 2078 + }, + { + "epoch": 0.033264, + "grad_norm": 0.83203125, + "learning_rate": 9.745483870967743e-05, + "loss": 0.224, + "step": 2079 + }, + { + "epoch": 0.03328, + "grad_norm": 1.3828125, + "learning_rate": 9.745322580645162e-05, + "loss": 0.235, + "step": 2080 + }, + { + "epoch": 0.033296, + "grad_norm": 1.0703125, + "learning_rate": 9.745161290322581e-05, + "loss": 0.227, + "step": 2081 + }, + { + "epoch": 0.033312, + "grad_norm": 1.046875, + "learning_rate": 9.745000000000001e-05, + "loss": 0.1771, + "step": 2082 + }, + { + "epoch": 0.033328, + "grad_norm": 0.62109375, + "learning_rate": 9.74483870967742e-05, + "loss": 0.2028, + "step": 2083 + }, + { + "epoch": 0.033344, + "grad_norm": 1.296875, + "learning_rate": 9.74467741935484e-05, + "loss": 0.1773, + "step": 2084 + }, + { + "epoch": 0.03336, + "grad_norm": 0.796875, + "learning_rate": 9.744516129032258e-05, + "loss": 0.1975, + "step": 2085 + }, + { + "epoch": 0.033376, + "grad_norm": 1.1171875, + "learning_rate": 9.744354838709678e-05, + "loss": 0.1905, + "step": 2086 + }, + { + "epoch": 0.033392, + "grad_norm": 0.80859375, + "learning_rate": 9.744193548387097e-05, + "loss": 0.1974, + "step": 2087 + }, + { + "epoch": 0.033408, + "grad_norm": 0.92578125, + "learning_rate": 9.744032258064517e-05, + "loss": 0.1651, + "step": 2088 + }, + { + "epoch": 0.033424, + "grad_norm": 1.3046875, + "learning_rate": 9.743870967741935e-05, + "loss": 0.2508, + "step": 2089 + }, + { + "epoch": 0.03344, + "grad_norm": 0.671875, + "learning_rate": 9.743709677419355e-05, + "loss": 0.1808, + "step": 2090 + }, + { + "epoch": 0.033456, + "grad_norm": 1.8046875, + "learning_rate": 9.743548387096775e-05, + "loss": 0.2281, + "step": 2091 + }, + { + "epoch": 0.033472, + "grad_norm": 0.8359375, + "learning_rate": 9.743387096774194e-05, + "loss": 0.1717, + "step": 2092 + }, + { + "epoch": 0.033488, + "grad_norm": 1.2734375, + "learning_rate": 9.743225806451614e-05, + "loss": 0.2016, + "step": 2093 + }, + { + "epoch": 0.033504, + "grad_norm": 0.90234375, + "learning_rate": 9.743064516129032e-05, + "loss": 0.2019, + "step": 2094 + }, + { + "epoch": 0.03352, + "grad_norm": 1.0234375, + "learning_rate": 9.742903225806452e-05, + "loss": 0.1806, + "step": 2095 + }, + { + "epoch": 0.033536, + "grad_norm": 0.77734375, + "learning_rate": 9.742741935483871e-05, + "loss": 0.1484, + "step": 2096 + }, + { + "epoch": 0.033552, + "grad_norm": 0.9765625, + "learning_rate": 9.742580645161291e-05, + "loss": 0.1894, + "step": 2097 + }, + { + "epoch": 0.033568, + "grad_norm": 1.15625, + "learning_rate": 9.74241935483871e-05, + "loss": 0.1902, + "step": 2098 + }, + { + "epoch": 0.033584, + "grad_norm": 1.1953125, + "learning_rate": 9.74225806451613e-05, + "loss": 0.1555, + "step": 2099 + }, + { + "epoch": 0.0336, + "grad_norm": 1.0234375, + "learning_rate": 9.742096774193548e-05, + "loss": 0.1976, + "step": 2100 + }, + { + "epoch": 0.033616, + "grad_norm": 0.8515625, + "learning_rate": 9.741935483870968e-05, + "loss": 0.2096, + "step": 2101 + }, + { + "epoch": 0.033632, + "grad_norm": 0.98046875, + "learning_rate": 9.741774193548388e-05, + "loss": 0.2353, + "step": 2102 + }, + { + "epoch": 0.033648, + "grad_norm": 1.2890625, + "learning_rate": 9.741612903225808e-05, + "loss": 0.2078, + "step": 2103 + }, + { + "epoch": 0.033664, + "grad_norm": 1.4609375, + "learning_rate": 9.741451612903227e-05, + "loss": 0.2134, + "step": 2104 + }, + { + "epoch": 0.03368, + "grad_norm": 0.98828125, + "learning_rate": 9.741290322580645e-05, + "loss": 0.2218, + "step": 2105 + }, + { + "epoch": 0.033696, + "grad_norm": 1.3046875, + "learning_rate": 9.741129032258065e-05, + "loss": 0.2071, + "step": 2106 + }, + { + "epoch": 0.033712, + "grad_norm": 0.86328125, + "learning_rate": 9.740967741935484e-05, + "loss": 0.1955, + "step": 2107 + }, + { + "epoch": 0.033728, + "grad_norm": 1.078125, + "learning_rate": 9.740806451612904e-05, + "loss": 0.2102, + "step": 2108 + }, + { + "epoch": 0.033744, + "grad_norm": 0.71875, + "learning_rate": 9.740645161290322e-05, + "loss": 0.1681, + "step": 2109 + }, + { + "epoch": 0.03376, + "grad_norm": 0.91015625, + "learning_rate": 9.740483870967742e-05, + "loss": 0.2117, + "step": 2110 + }, + { + "epoch": 0.033776, + "grad_norm": 0.875, + "learning_rate": 9.740322580645161e-05, + "loss": 0.2132, + "step": 2111 + }, + { + "epoch": 0.033792, + "grad_norm": 0.890625, + "learning_rate": 9.740161290322581e-05, + "loss": 0.1871, + "step": 2112 + }, + { + "epoch": 0.033808, + "grad_norm": 0.69140625, + "learning_rate": 9.74e-05, + "loss": 0.1721, + "step": 2113 + }, + { + "epoch": 0.033824, + "grad_norm": 0.65234375, + "learning_rate": 9.73983870967742e-05, + "loss": 0.1701, + "step": 2114 + }, + { + "epoch": 0.03384, + "grad_norm": 1.03125, + "learning_rate": 9.73967741935484e-05, + "loss": 0.2045, + "step": 2115 + }, + { + "epoch": 0.033856, + "grad_norm": 0.87109375, + "learning_rate": 9.73951612903226e-05, + "loss": 0.1778, + "step": 2116 + }, + { + "epoch": 0.033872, + "grad_norm": 0.73046875, + "learning_rate": 9.739354838709678e-05, + "loss": 0.1597, + "step": 2117 + }, + { + "epoch": 0.033888, + "grad_norm": 0.75390625, + "learning_rate": 9.739193548387098e-05, + "loss": 0.1756, + "step": 2118 + }, + { + "epoch": 0.033904, + "grad_norm": 0.93359375, + "learning_rate": 9.739032258064517e-05, + "loss": 0.2152, + "step": 2119 + }, + { + "epoch": 0.03392, + "grad_norm": 0.859375, + "learning_rate": 9.738870967741935e-05, + "loss": 0.2031, + "step": 2120 + }, + { + "epoch": 0.033936, + "grad_norm": 0.62890625, + "learning_rate": 9.738709677419355e-05, + "loss": 0.181, + "step": 2121 + }, + { + "epoch": 0.033952, + "grad_norm": 0.71875, + "learning_rate": 9.738548387096774e-05, + "loss": 0.183, + "step": 2122 + }, + { + "epoch": 0.033968, + "grad_norm": 1.1484375, + "learning_rate": 9.738387096774194e-05, + "loss": 0.1897, + "step": 2123 + }, + { + "epoch": 0.033984, + "grad_norm": 0.9453125, + "learning_rate": 9.738225806451612e-05, + "loss": 0.1644, + "step": 2124 + }, + { + "epoch": 0.034, + "grad_norm": 1.0078125, + "learning_rate": 9.738064516129032e-05, + "loss": 0.1856, + "step": 2125 + }, + { + "epoch": 0.034016, + "grad_norm": 1.0078125, + "learning_rate": 9.737903225806452e-05, + "loss": 0.2074, + "step": 2126 + }, + { + "epoch": 0.034032, + "grad_norm": 1.5859375, + "learning_rate": 9.737741935483872e-05, + "loss": 0.1749, + "step": 2127 + }, + { + "epoch": 0.034048, + "grad_norm": 0.97265625, + "learning_rate": 9.737580645161291e-05, + "loss": 0.1727, + "step": 2128 + }, + { + "epoch": 0.034064, + "grad_norm": 0.86328125, + "learning_rate": 9.737419354838711e-05, + "loss": 0.1391, + "step": 2129 + }, + { + "epoch": 0.03408, + "grad_norm": 1.171875, + "learning_rate": 9.73725806451613e-05, + "loss": 0.1931, + "step": 2130 + }, + { + "epoch": 0.034096, + "grad_norm": 0.8046875, + "learning_rate": 9.73709677419355e-05, + "loss": 0.1804, + "step": 2131 + }, + { + "epoch": 0.034112, + "grad_norm": 0.76171875, + "learning_rate": 9.736935483870968e-05, + "loss": 0.1706, + "step": 2132 + }, + { + "epoch": 0.034128, + "grad_norm": 1.0078125, + "learning_rate": 9.736774193548388e-05, + "loss": 0.1769, + "step": 2133 + }, + { + "epoch": 0.034144, + "grad_norm": 1.4140625, + "learning_rate": 9.736612903225807e-05, + "loss": 0.2241, + "step": 2134 + }, + { + "epoch": 0.03416, + "grad_norm": 1.21875, + "learning_rate": 9.736451612903227e-05, + "loss": 0.1934, + "step": 2135 + }, + { + "epoch": 0.034176, + "grad_norm": 0.87890625, + "learning_rate": 9.736290322580645e-05, + "loss": 0.2069, + "step": 2136 + }, + { + "epoch": 0.034192, + "grad_norm": 0.8671875, + "learning_rate": 9.736129032258065e-05, + "loss": 0.2068, + "step": 2137 + }, + { + "epoch": 0.034208, + "grad_norm": 0.96875, + "learning_rate": 9.735967741935485e-05, + "loss": 0.2377, + "step": 2138 + }, + { + "epoch": 0.034224, + "grad_norm": 0.85546875, + "learning_rate": 9.735806451612904e-05, + "loss": 0.1862, + "step": 2139 + }, + { + "epoch": 0.03424, + "grad_norm": 1.4296875, + "learning_rate": 9.735645161290324e-05, + "loss": 0.2024, + "step": 2140 + }, + { + "epoch": 0.034256, + "grad_norm": 0.8671875, + "learning_rate": 9.735483870967742e-05, + "loss": 0.2032, + "step": 2141 + }, + { + "epoch": 0.034272, + "grad_norm": 1.5546875, + "learning_rate": 9.735322580645162e-05, + "loss": 0.1952, + "step": 2142 + }, + { + "epoch": 0.034288, + "grad_norm": 1.0390625, + "learning_rate": 9.735161290322581e-05, + "loss": 0.2133, + "step": 2143 + }, + { + "epoch": 0.034304, + "grad_norm": 0.73046875, + "learning_rate": 9.735000000000001e-05, + "loss": 0.1747, + "step": 2144 + }, + { + "epoch": 0.03432, + "grad_norm": 1.140625, + "learning_rate": 9.73483870967742e-05, + "loss": 0.2327, + "step": 2145 + }, + { + "epoch": 0.034336, + "grad_norm": 2.234375, + "learning_rate": 9.73467741935484e-05, + "loss": 0.2076, + "step": 2146 + }, + { + "epoch": 0.034352, + "grad_norm": 0.7734375, + "learning_rate": 9.734516129032258e-05, + "loss": 0.1872, + "step": 2147 + }, + { + "epoch": 0.034368, + "grad_norm": 0.93359375, + "learning_rate": 9.734354838709678e-05, + "loss": 0.1792, + "step": 2148 + }, + { + "epoch": 0.034384, + "grad_norm": 1.046875, + "learning_rate": 9.734193548387097e-05, + "loss": 0.2065, + "step": 2149 + }, + { + "epoch": 0.0344, + "grad_norm": 1.390625, + "learning_rate": 9.734032258064517e-05, + "loss": 0.216, + "step": 2150 + }, + { + "epoch": 0.034416, + "grad_norm": 0.7734375, + "learning_rate": 9.733870967741936e-05, + "loss": 0.2445, + "step": 2151 + }, + { + "epoch": 0.034432, + "grad_norm": 0.96484375, + "learning_rate": 9.733709677419355e-05, + "loss": 0.1546, + "step": 2152 + }, + { + "epoch": 0.034448, + "grad_norm": 0.92578125, + "learning_rate": 9.733548387096775e-05, + "loss": 0.1913, + "step": 2153 + }, + { + "epoch": 0.034464, + "grad_norm": 0.97265625, + "learning_rate": 9.733387096774194e-05, + "loss": 0.2102, + "step": 2154 + }, + { + "epoch": 0.03448, + "grad_norm": 0.9453125, + "learning_rate": 9.733225806451614e-05, + "loss": 0.2222, + "step": 2155 + }, + { + "epoch": 0.034496, + "grad_norm": 0.85546875, + "learning_rate": 9.733064516129032e-05, + "loss": 0.1946, + "step": 2156 + }, + { + "epoch": 0.034512, + "grad_norm": 0.8359375, + "learning_rate": 9.732903225806452e-05, + "loss": 0.1637, + "step": 2157 + }, + { + "epoch": 0.034528, + "grad_norm": 1.015625, + "learning_rate": 9.732741935483871e-05, + "loss": 0.1907, + "step": 2158 + }, + { + "epoch": 0.034544, + "grad_norm": 1.1640625, + "learning_rate": 9.732580645161291e-05, + "loss": 0.1863, + "step": 2159 + }, + { + "epoch": 0.03456, + "grad_norm": 1.0, + "learning_rate": 9.73241935483871e-05, + "loss": 0.215, + "step": 2160 + }, + { + "epoch": 0.034576, + "grad_norm": 0.765625, + "learning_rate": 9.73225806451613e-05, + "loss": 0.203, + "step": 2161 + }, + { + "epoch": 0.034592, + "grad_norm": 0.71484375, + "learning_rate": 9.732096774193549e-05, + "loss": 0.1661, + "step": 2162 + }, + { + "epoch": 0.034608, + "grad_norm": 0.9609375, + "learning_rate": 9.731935483870969e-05, + "loss": 0.2275, + "step": 2163 + }, + { + "epoch": 0.034624, + "grad_norm": 0.9609375, + "learning_rate": 9.731774193548388e-05, + "loss": 0.1818, + "step": 2164 + }, + { + "epoch": 0.03464, + "grad_norm": 1.0078125, + "learning_rate": 9.731612903225808e-05, + "loss": 0.2012, + "step": 2165 + }, + { + "epoch": 0.034656, + "grad_norm": 0.90234375, + "learning_rate": 9.731451612903226e-05, + "loss": 0.1978, + "step": 2166 + }, + { + "epoch": 0.034672, + "grad_norm": 1.1171875, + "learning_rate": 9.731290322580645e-05, + "loss": 0.2705, + "step": 2167 + }, + { + "epoch": 0.034688, + "grad_norm": 0.88671875, + "learning_rate": 9.731129032258065e-05, + "loss": 0.2219, + "step": 2168 + }, + { + "epoch": 0.034704, + "grad_norm": 0.859375, + "learning_rate": 9.730967741935484e-05, + "loss": 0.1719, + "step": 2169 + }, + { + "epoch": 0.03472, + "grad_norm": 1.3984375, + "learning_rate": 9.730806451612904e-05, + "loss": 0.1959, + "step": 2170 + }, + { + "epoch": 0.034736, + "grad_norm": 0.84765625, + "learning_rate": 9.730645161290322e-05, + "loss": 0.1889, + "step": 2171 + }, + { + "epoch": 0.034752, + "grad_norm": 0.78515625, + "learning_rate": 9.730483870967742e-05, + "loss": 0.1756, + "step": 2172 + }, + { + "epoch": 0.034768, + "grad_norm": 0.76953125, + "learning_rate": 9.730322580645162e-05, + "loss": 0.1685, + "step": 2173 + }, + { + "epoch": 0.034784, + "grad_norm": 0.6796875, + "learning_rate": 9.730161290322581e-05, + "loss": 0.1684, + "step": 2174 + }, + { + "epoch": 0.0348, + "grad_norm": 0.953125, + "learning_rate": 9.730000000000001e-05, + "loss": 0.1682, + "step": 2175 + }, + { + "epoch": 0.034816, + "grad_norm": 1.046875, + "learning_rate": 9.72983870967742e-05, + "loss": 0.1707, + "step": 2176 + }, + { + "epoch": 0.034832, + "grad_norm": 0.81640625, + "learning_rate": 9.729677419354839e-05, + "loss": 0.1866, + "step": 2177 + }, + { + "epoch": 0.034848, + "grad_norm": 0.8984375, + "learning_rate": 9.729516129032259e-05, + "loss": 0.1933, + "step": 2178 + }, + { + "epoch": 0.034864, + "grad_norm": 1.0234375, + "learning_rate": 9.729354838709678e-05, + "loss": 0.1845, + "step": 2179 + }, + { + "epoch": 0.03488, + "grad_norm": 0.8984375, + "learning_rate": 9.729193548387098e-05, + "loss": 0.2145, + "step": 2180 + }, + { + "epoch": 0.034896, + "grad_norm": 1.3671875, + "learning_rate": 9.729032258064516e-05, + "loss": 0.1553, + "step": 2181 + }, + { + "epoch": 0.034912, + "grad_norm": 1.1015625, + "learning_rate": 9.728870967741935e-05, + "loss": 0.2649, + "step": 2182 + }, + { + "epoch": 0.034928, + "grad_norm": 1.015625, + "learning_rate": 9.728709677419355e-05, + "loss": 0.2054, + "step": 2183 + }, + { + "epoch": 0.034944, + "grad_norm": 0.84765625, + "learning_rate": 9.728548387096774e-05, + "loss": 0.2313, + "step": 2184 + }, + { + "epoch": 0.03496, + "grad_norm": 1.015625, + "learning_rate": 9.728387096774194e-05, + "loss": 0.16, + "step": 2185 + }, + { + "epoch": 0.034976, + "grad_norm": 1.2109375, + "learning_rate": 9.728225806451614e-05, + "loss": 0.2665, + "step": 2186 + }, + { + "epoch": 0.034992, + "grad_norm": 0.98046875, + "learning_rate": 9.728064516129033e-05, + "loss": 0.214, + "step": 2187 + }, + { + "epoch": 0.035008, + "grad_norm": 0.90625, + "learning_rate": 9.727903225806452e-05, + "loss": 0.1635, + "step": 2188 + }, + { + "epoch": 0.035024, + "grad_norm": 1.0546875, + "learning_rate": 9.727741935483872e-05, + "loss": 0.1924, + "step": 2189 + }, + { + "epoch": 0.03504, + "grad_norm": 1.1953125, + "learning_rate": 9.72758064516129e-05, + "loss": 0.1911, + "step": 2190 + }, + { + "epoch": 0.035056, + "grad_norm": 0.87109375, + "learning_rate": 9.72741935483871e-05, + "loss": 0.1834, + "step": 2191 + }, + { + "epoch": 0.035072, + "grad_norm": 0.90234375, + "learning_rate": 9.727258064516129e-05, + "loss": 0.2036, + "step": 2192 + }, + { + "epoch": 0.035088, + "grad_norm": 1.0390625, + "learning_rate": 9.727096774193549e-05, + "loss": 0.1809, + "step": 2193 + }, + { + "epoch": 0.035104, + "grad_norm": 1.109375, + "learning_rate": 9.726935483870968e-05, + "loss": 0.2205, + "step": 2194 + }, + { + "epoch": 0.03512, + "grad_norm": 0.9609375, + "learning_rate": 9.726774193548388e-05, + "loss": 0.1807, + "step": 2195 + }, + { + "epoch": 0.035136, + "grad_norm": 1.515625, + "learning_rate": 9.726612903225806e-05, + "loss": 0.2638, + "step": 2196 + }, + { + "epoch": 0.035152, + "grad_norm": 1.0703125, + "learning_rate": 9.726451612903226e-05, + "loss": 0.1847, + "step": 2197 + }, + { + "epoch": 0.035168, + "grad_norm": 1.3046875, + "learning_rate": 9.726290322580646e-05, + "loss": 0.2419, + "step": 2198 + }, + { + "epoch": 0.035184, + "grad_norm": 0.875, + "learning_rate": 9.726129032258065e-05, + "loss": 0.1964, + "step": 2199 + }, + { + "epoch": 0.0352, + "grad_norm": 1.2734375, + "learning_rate": 9.725967741935485e-05, + "loss": 0.2166, + "step": 2200 + }, + { + "epoch": 0.035216, + "grad_norm": 1.046875, + "learning_rate": 9.725806451612903e-05, + "loss": 0.1708, + "step": 2201 + }, + { + "epoch": 0.035232, + "grad_norm": 0.99609375, + "learning_rate": 9.725645161290323e-05, + "loss": 0.1732, + "step": 2202 + }, + { + "epoch": 0.035248, + "grad_norm": 0.8984375, + "learning_rate": 9.725483870967742e-05, + "loss": 0.2056, + "step": 2203 + }, + { + "epoch": 0.035264, + "grad_norm": 0.765625, + "learning_rate": 9.725322580645162e-05, + "loss": 0.1935, + "step": 2204 + }, + { + "epoch": 0.03528, + "grad_norm": 0.859375, + "learning_rate": 9.72516129032258e-05, + "loss": 0.1703, + "step": 2205 + }, + { + "epoch": 0.035296, + "grad_norm": 0.6328125, + "learning_rate": 9.725e-05, + "loss": 0.1744, + "step": 2206 + }, + { + "epoch": 0.035312, + "grad_norm": 1.4296875, + "learning_rate": 9.724838709677419e-05, + "loss": 0.2041, + "step": 2207 + }, + { + "epoch": 0.035328, + "grad_norm": 0.76953125, + "learning_rate": 9.724677419354839e-05, + "loss": 0.2213, + "step": 2208 + }, + { + "epoch": 0.035344, + "grad_norm": 0.81640625, + "learning_rate": 9.724516129032258e-05, + "loss": 0.1872, + "step": 2209 + }, + { + "epoch": 0.03536, + "grad_norm": 0.82421875, + "learning_rate": 9.724354838709678e-05, + "loss": 0.2099, + "step": 2210 + }, + { + "epoch": 0.035376, + "grad_norm": 1.3984375, + "learning_rate": 9.724193548387098e-05, + "loss": 0.2435, + "step": 2211 + }, + { + "epoch": 0.035392, + "grad_norm": 0.82421875, + "learning_rate": 9.724032258064518e-05, + "loss": 0.2296, + "step": 2212 + }, + { + "epoch": 0.035408, + "grad_norm": 0.7265625, + "learning_rate": 9.723870967741936e-05, + "loss": 0.1899, + "step": 2213 + }, + { + "epoch": 0.035424, + "grad_norm": 0.7109375, + "learning_rate": 9.723709677419355e-05, + "loss": 0.161, + "step": 2214 + }, + { + "epoch": 0.03544, + "grad_norm": 0.83203125, + "learning_rate": 9.723548387096775e-05, + "loss": 0.2, + "step": 2215 + }, + { + "epoch": 0.035456, + "grad_norm": 0.875, + "learning_rate": 9.723387096774193e-05, + "loss": 0.1617, + "step": 2216 + }, + { + "epoch": 0.035472, + "grad_norm": 1.265625, + "learning_rate": 9.723225806451613e-05, + "loss": 0.2294, + "step": 2217 + }, + { + "epoch": 0.035488, + "grad_norm": 0.95703125, + "learning_rate": 9.723064516129032e-05, + "loss": 0.2131, + "step": 2218 + }, + { + "epoch": 0.035504, + "grad_norm": 0.734375, + "learning_rate": 9.722903225806452e-05, + "loss": 0.1908, + "step": 2219 + }, + { + "epoch": 0.03552, + "grad_norm": 0.65625, + "learning_rate": 9.72274193548387e-05, + "loss": 0.1713, + "step": 2220 + }, + { + "epoch": 0.035536, + "grad_norm": 0.95703125, + "learning_rate": 9.72258064516129e-05, + "loss": 0.2235, + "step": 2221 + }, + { + "epoch": 0.035552, + "grad_norm": 1.09375, + "learning_rate": 9.72241935483871e-05, + "loss": 0.1862, + "step": 2222 + }, + { + "epoch": 0.035568, + "grad_norm": 1.0078125, + "learning_rate": 9.72225806451613e-05, + "loss": 0.1747, + "step": 2223 + }, + { + "epoch": 0.035584, + "grad_norm": 1.0390625, + "learning_rate": 9.722096774193549e-05, + "loss": 0.2023, + "step": 2224 + }, + { + "epoch": 0.0356, + "grad_norm": 0.77734375, + "learning_rate": 9.721935483870969e-05, + "loss": 0.1633, + "step": 2225 + }, + { + "epoch": 0.035616, + "grad_norm": 0.98828125, + "learning_rate": 9.721774193548388e-05, + "loss": 0.1629, + "step": 2226 + }, + { + "epoch": 0.035632, + "grad_norm": 0.9140625, + "learning_rate": 9.721612903225808e-05, + "loss": 0.2071, + "step": 2227 + }, + { + "epoch": 0.035648, + "grad_norm": 0.8203125, + "learning_rate": 9.721451612903226e-05, + "loss": 0.1783, + "step": 2228 + }, + { + "epoch": 0.035664, + "grad_norm": 0.875, + "learning_rate": 9.721290322580645e-05, + "loss": 0.2286, + "step": 2229 + }, + { + "epoch": 0.03568, + "grad_norm": 1.4453125, + "learning_rate": 9.721129032258065e-05, + "loss": 0.2056, + "step": 2230 + }, + { + "epoch": 0.035696, + "grad_norm": 1.2265625, + "learning_rate": 9.720967741935483e-05, + "loss": 0.1879, + "step": 2231 + }, + { + "epoch": 0.035712, + "grad_norm": 0.92578125, + "learning_rate": 9.720806451612903e-05, + "loss": 0.1975, + "step": 2232 + }, + { + "epoch": 0.035728, + "grad_norm": 0.86328125, + "learning_rate": 9.720645161290323e-05, + "loss": 0.1476, + "step": 2233 + }, + { + "epoch": 0.035744, + "grad_norm": 1.078125, + "learning_rate": 9.720483870967743e-05, + "loss": 0.2446, + "step": 2234 + }, + { + "epoch": 0.03576, + "grad_norm": 1.4609375, + "learning_rate": 9.720322580645162e-05, + "loss": 0.2535, + "step": 2235 + }, + { + "epoch": 0.035776, + "grad_norm": 0.96484375, + "learning_rate": 9.720161290322582e-05, + "loss": 0.2031, + "step": 2236 + }, + { + "epoch": 0.035792, + "grad_norm": 0.83984375, + "learning_rate": 9.72e-05, + "loss": 0.1985, + "step": 2237 + }, + { + "epoch": 0.035808, + "grad_norm": 0.890625, + "learning_rate": 9.71983870967742e-05, + "loss": 0.2132, + "step": 2238 + }, + { + "epoch": 0.035824, + "grad_norm": 0.8125, + "learning_rate": 9.719677419354839e-05, + "loss": 0.2219, + "step": 2239 + }, + { + "epoch": 0.03584, + "grad_norm": 0.98828125, + "learning_rate": 9.719516129032259e-05, + "loss": 0.2198, + "step": 2240 + }, + { + "epoch": 0.035856, + "grad_norm": 0.8125, + "learning_rate": 9.719354838709678e-05, + "loss": 0.1651, + "step": 2241 + }, + { + "epoch": 0.035872, + "grad_norm": 1.0859375, + "learning_rate": 9.719193548387098e-05, + "loss": 0.1995, + "step": 2242 + }, + { + "epoch": 0.035888, + "grad_norm": 0.6875, + "learning_rate": 9.719032258064516e-05, + "loss": 0.1564, + "step": 2243 + }, + { + "epoch": 0.035904, + "grad_norm": 0.9921875, + "learning_rate": 9.718870967741935e-05, + "loss": 0.1715, + "step": 2244 + }, + { + "epoch": 0.03592, + "grad_norm": 1.0625, + "learning_rate": 9.718709677419355e-05, + "loss": 0.1948, + "step": 2245 + }, + { + "epoch": 0.035936, + "grad_norm": 0.90625, + "learning_rate": 9.718548387096775e-05, + "loss": 0.1917, + "step": 2246 + }, + { + "epoch": 0.035952, + "grad_norm": 1.5078125, + "learning_rate": 9.718387096774195e-05, + "loss": 0.2145, + "step": 2247 + }, + { + "epoch": 0.035968, + "grad_norm": 0.8515625, + "learning_rate": 9.718225806451613e-05, + "loss": 0.1937, + "step": 2248 + }, + { + "epoch": 0.035984, + "grad_norm": 0.8125, + "learning_rate": 9.718064516129033e-05, + "loss": 0.2006, + "step": 2249 + }, + { + "epoch": 0.036, + "grad_norm": 1.1953125, + "learning_rate": 9.717903225806452e-05, + "loss": 0.218, + "step": 2250 + }, + { + "epoch": 0.036016, + "grad_norm": 0.98046875, + "learning_rate": 9.717741935483872e-05, + "loss": 0.173, + "step": 2251 + }, + { + "epoch": 0.036032, + "grad_norm": 0.953125, + "learning_rate": 9.71758064516129e-05, + "loss": 0.2018, + "step": 2252 + }, + { + "epoch": 0.036048, + "grad_norm": 0.65625, + "learning_rate": 9.71741935483871e-05, + "loss": 0.1463, + "step": 2253 + }, + { + "epoch": 0.036064, + "grad_norm": 1.1171875, + "learning_rate": 9.717258064516129e-05, + "loss": 0.1746, + "step": 2254 + }, + { + "epoch": 0.03608, + "grad_norm": 0.8203125, + "learning_rate": 9.717096774193549e-05, + "loss": 0.195, + "step": 2255 + }, + { + "epoch": 0.036096, + "grad_norm": 1.0, + "learning_rate": 9.716935483870968e-05, + "loss": 0.1875, + "step": 2256 + }, + { + "epoch": 0.036112, + "grad_norm": 0.97265625, + "learning_rate": 9.716774193548388e-05, + "loss": 0.1831, + "step": 2257 + }, + { + "epoch": 0.036128, + "grad_norm": 0.8671875, + "learning_rate": 9.716612903225807e-05, + "loss": 0.1994, + "step": 2258 + }, + { + "epoch": 0.036144, + "grad_norm": 0.70703125, + "learning_rate": 9.716451612903227e-05, + "loss": 0.1751, + "step": 2259 + }, + { + "epoch": 0.03616, + "grad_norm": 1.2578125, + "learning_rate": 9.716290322580646e-05, + "loss": 0.1838, + "step": 2260 + }, + { + "epoch": 0.036176, + "grad_norm": 0.75390625, + "learning_rate": 9.716129032258065e-05, + "loss": 0.1447, + "step": 2261 + }, + { + "epoch": 0.036192, + "grad_norm": 1.3203125, + "learning_rate": 9.715967741935485e-05, + "loss": 0.1981, + "step": 2262 + }, + { + "epoch": 0.036208, + "grad_norm": 0.8984375, + "learning_rate": 9.715806451612903e-05, + "loss": 0.2462, + "step": 2263 + }, + { + "epoch": 0.036224, + "grad_norm": 0.8359375, + "learning_rate": 9.715645161290323e-05, + "loss": 0.1475, + "step": 2264 + }, + { + "epoch": 0.03624, + "grad_norm": 0.8828125, + "learning_rate": 9.715483870967742e-05, + "loss": 0.2098, + "step": 2265 + }, + { + "epoch": 0.036256, + "grad_norm": 0.9375, + "learning_rate": 9.715322580645162e-05, + "loss": 0.2352, + "step": 2266 + }, + { + "epoch": 0.036272, + "grad_norm": 1.0546875, + "learning_rate": 9.71516129032258e-05, + "loss": 0.205, + "step": 2267 + }, + { + "epoch": 0.036288, + "grad_norm": 1.0625, + "learning_rate": 9.715e-05, + "loss": 0.2139, + "step": 2268 + }, + { + "epoch": 0.036304, + "grad_norm": 0.85546875, + "learning_rate": 9.714838709677419e-05, + "loss": 0.1875, + "step": 2269 + }, + { + "epoch": 0.03632, + "grad_norm": 1.1484375, + "learning_rate": 9.714677419354839e-05, + "loss": 0.1849, + "step": 2270 + }, + { + "epoch": 0.036336, + "grad_norm": 1.265625, + "learning_rate": 9.714516129032259e-05, + "loss": 0.2253, + "step": 2271 + }, + { + "epoch": 0.036352, + "grad_norm": 1.03125, + "learning_rate": 9.714354838709679e-05, + "loss": 0.1904, + "step": 2272 + }, + { + "epoch": 0.036368, + "grad_norm": 0.69140625, + "learning_rate": 9.714193548387097e-05, + "loss": 0.1877, + "step": 2273 + }, + { + "epoch": 0.036384, + "grad_norm": 0.62890625, + "learning_rate": 9.714032258064517e-05, + "loss": 0.1497, + "step": 2274 + }, + { + "epoch": 0.0364, + "grad_norm": 0.89453125, + "learning_rate": 9.713870967741936e-05, + "loss": 0.1804, + "step": 2275 + }, + { + "epoch": 0.036416, + "grad_norm": 1.1875, + "learning_rate": 9.713709677419355e-05, + "loss": 0.2069, + "step": 2276 + }, + { + "epoch": 0.036432, + "grad_norm": 1.3984375, + "learning_rate": 9.713548387096775e-05, + "loss": 0.2147, + "step": 2277 + }, + { + "epoch": 0.036448, + "grad_norm": 0.87109375, + "learning_rate": 9.713387096774193e-05, + "loss": 0.2043, + "step": 2278 + }, + { + "epoch": 0.036464, + "grad_norm": 0.78125, + "learning_rate": 9.713225806451613e-05, + "loss": 0.1765, + "step": 2279 + }, + { + "epoch": 0.03648, + "grad_norm": 0.8828125, + "learning_rate": 9.713064516129032e-05, + "loss": 0.1953, + "step": 2280 + }, + { + "epoch": 0.036496, + "grad_norm": 0.97265625, + "learning_rate": 9.712903225806452e-05, + "loss": 0.1504, + "step": 2281 + }, + { + "epoch": 0.036512, + "grad_norm": 0.640625, + "learning_rate": 9.712741935483872e-05, + "loss": 0.1993, + "step": 2282 + }, + { + "epoch": 0.036528, + "grad_norm": 1.515625, + "learning_rate": 9.712580645161292e-05, + "loss": 0.2225, + "step": 2283 + }, + { + "epoch": 0.036544, + "grad_norm": 1.5, + "learning_rate": 9.71241935483871e-05, + "loss": 0.1992, + "step": 2284 + }, + { + "epoch": 0.03656, + "grad_norm": 0.61328125, + "learning_rate": 9.71225806451613e-05, + "loss": 0.1778, + "step": 2285 + }, + { + "epoch": 0.036576, + "grad_norm": 0.9609375, + "learning_rate": 9.712096774193549e-05, + "loss": 0.2085, + "step": 2286 + }, + { + "epoch": 0.036592, + "grad_norm": 0.75390625, + "learning_rate": 9.711935483870969e-05, + "loss": 0.1665, + "step": 2287 + }, + { + "epoch": 0.036608, + "grad_norm": 1.3125, + "learning_rate": 9.711774193548387e-05, + "loss": 0.2373, + "step": 2288 + }, + { + "epoch": 0.036624, + "grad_norm": 1.046875, + "learning_rate": 9.711612903225807e-05, + "loss": 0.1942, + "step": 2289 + }, + { + "epoch": 0.03664, + "grad_norm": 1.3359375, + "learning_rate": 9.711451612903226e-05, + "loss": 0.1829, + "step": 2290 + }, + { + "epoch": 0.036656, + "grad_norm": 1.203125, + "learning_rate": 9.711290322580645e-05, + "loss": 0.2259, + "step": 2291 + }, + { + "epoch": 0.036672, + "grad_norm": 0.8828125, + "learning_rate": 9.711129032258065e-05, + "loss": 0.1934, + "step": 2292 + }, + { + "epoch": 0.036688, + "grad_norm": 0.796875, + "learning_rate": 9.710967741935484e-05, + "loss": 0.2188, + "step": 2293 + }, + { + "epoch": 0.036704, + "grad_norm": 0.98828125, + "learning_rate": 9.710806451612904e-05, + "loss": 0.191, + "step": 2294 + }, + { + "epoch": 0.03672, + "grad_norm": 1.484375, + "learning_rate": 9.710645161290323e-05, + "loss": 0.1769, + "step": 2295 + }, + { + "epoch": 0.036736, + "grad_norm": 0.71875, + "learning_rate": 9.710483870967743e-05, + "loss": 0.1947, + "step": 2296 + }, + { + "epoch": 0.036752, + "grad_norm": 1.3671875, + "learning_rate": 9.710322580645162e-05, + "loss": 0.1791, + "step": 2297 + }, + { + "epoch": 0.036768, + "grad_norm": 1.59375, + "learning_rate": 9.710161290322582e-05, + "loss": 0.2474, + "step": 2298 + }, + { + "epoch": 0.036784, + "grad_norm": 1.0859375, + "learning_rate": 9.71e-05, + "loss": 0.1998, + "step": 2299 + }, + { + "epoch": 0.0368, + "grad_norm": 1.046875, + "learning_rate": 9.70983870967742e-05, + "loss": 0.202, + "step": 2300 + }, + { + "epoch": 0.036816, + "grad_norm": 0.98828125, + "learning_rate": 9.709677419354839e-05, + "loss": 0.1826, + "step": 2301 + }, + { + "epoch": 0.036832, + "grad_norm": 1.4921875, + "learning_rate": 9.709516129032259e-05, + "loss": 0.193, + "step": 2302 + }, + { + "epoch": 0.036848, + "grad_norm": 1.359375, + "learning_rate": 9.709354838709677e-05, + "loss": 0.1974, + "step": 2303 + }, + { + "epoch": 0.036864, + "grad_norm": 1.8203125, + "learning_rate": 9.709193548387097e-05, + "loss": 0.1927, + "step": 2304 + }, + { + "epoch": 0.03688, + "grad_norm": 2.046875, + "learning_rate": 9.709032258064516e-05, + "loss": 0.1981, + "step": 2305 + }, + { + "epoch": 0.036896, + "grad_norm": 1.84375, + "learning_rate": 9.708870967741936e-05, + "loss": 0.2049, + "step": 2306 + }, + { + "epoch": 0.036912, + "grad_norm": 0.91796875, + "learning_rate": 9.708709677419356e-05, + "loss": 0.1887, + "step": 2307 + }, + { + "epoch": 0.036928, + "grad_norm": 0.9453125, + "learning_rate": 9.708548387096774e-05, + "loss": 0.189, + "step": 2308 + }, + { + "epoch": 0.036944, + "grad_norm": 1.5703125, + "learning_rate": 9.708387096774194e-05, + "loss": 0.1589, + "step": 2309 + }, + { + "epoch": 0.03696, + "grad_norm": 0.74609375, + "learning_rate": 9.708225806451613e-05, + "loss": 0.1668, + "step": 2310 + }, + { + "epoch": 0.036976, + "grad_norm": 1.1875, + "learning_rate": 9.708064516129033e-05, + "loss": 0.2271, + "step": 2311 + }, + { + "epoch": 0.036992, + "grad_norm": 0.9609375, + "learning_rate": 9.707903225806452e-05, + "loss": 0.1996, + "step": 2312 + }, + { + "epoch": 0.037008, + "grad_norm": 0.953125, + "learning_rate": 9.707741935483872e-05, + "loss": 0.2104, + "step": 2313 + }, + { + "epoch": 0.037024, + "grad_norm": 0.90234375, + "learning_rate": 9.70758064516129e-05, + "loss": 0.1646, + "step": 2314 + }, + { + "epoch": 0.03704, + "grad_norm": 1.125, + "learning_rate": 9.70741935483871e-05, + "loss": 0.1978, + "step": 2315 + }, + { + "epoch": 0.037056, + "grad_norm": 0.75, + "learning_rate": 9.707258064516129e-05, + "loss": 0.1408, + "step": 2316 + }, + { + "epoch": 0.037072, + "grad_norm": 0.98046875, + "learning_rate": 9.707096774193549e-05, + "loss": 0.1937, + "step": 2317 + }, + { + "epoch": 0.037088, + "grad_norm": 1.0625, + "learning_rate": 9.706935483870969e-05, + "loss": 0.2319, + "step": 2318 + }, + { + "epoch": 0.037104, + "grad_norm": 1.2578125, + "learning_rate": 9.706774193548389e-05, + "loss": 0.2055, + "step": 2319 + }, + { + "epoch": 0.03712, + "grad_norm": 1.03125, + "learning_rate": 9.706612903225807e-05, + "loss": 0.1895, + "step": 2320 + }, + { + "epoch": 0.037136, + "grad_norm": 0.9453125, + "learning_rate": 9.706451612903227e-05, + "loss": 0.1971, + "step": 2321 + }, + { + "epoch": 0.037152, + "grad_norm": 0.90625, + "learning_rate": 9.706290322580646e-05, + "loss": 0.2147, + "step": 2322 + }, + { + "epoch": 0.037168, + "grad_norm": 0.9765625, + "learning_rate": 9.706129032258064e-05, + "loss": 0.1723, + "step": 2323 + }, + { + "epoch": 0.037184, + "grad_norm": 0.92578125, + "learning_rate": 9.705967741935484e-05, + "loss": 0.1979, + "step": 2324 + }, + { + "epoch": 0.0372, + "grad_norm": 0.97265625, + "learning_rate": 9.705806451612903e-05, + "loss": 0.1993, + "step": 2325 + }, + { + "epoch": 0.037216, + "grad_norm": 1.1328125, + "learning_rate": 9.705645161290323e-05, + "loss": 0.2292, + "step": 2326 + }, + { + "epoch": 0.037232, + "grad_norm": 0.87109375, + "learning_rate": 9.705483870967742e-05, + "loss": 0.1966, + "step": 2327 + }, + { + "epoch": 0.037248, + "grad_norm": 1.015625, + "learning_rate": 9.705322580645162e-05, + "loss": 0.2055, + "step": 2328 + }, + { + "epoch": 0.037264, + "grad_norm": 1.0546875, + "learning_rate": 9.705161290322581e-05, + "loss": 0.1769, + "step": 2329 + }, + { + "epoch": 0.03728, + "grad_norm": 1.2265625, + "learning_rate": 9.705e-05, + "loss": 0.2029, + "step": 2330 + }, + { + "epoch": 0.037296, + "grad_norm": 0.87109375, + "learning_rate": 9.70483870967742e-05, + "loss": 0.2335, + "step": 2331 + }, + { + "epoch": 0.037312, + "grad_norm": 0.9453125, + "learning_rate": 9.70467741935484e-05, + "loss": 0.2112, + "step": 2332 + }, + { + "epoch": 0.037328, + "grad_norm": 0.85546875, + "learning_rate": 9.704516129032259e-05, + "loss": 0.1765, + "step": 2333 + }, + { + "epoch": 0.037344, + "grad_norm": 1.0703125, + "learning_rate": 9.704354838709679e-05, + "loss": 0.2104, + "step": 2334 + }, + { + "epoch": 0.03736, + "grad_norm": 0.84375, + "learning_rate": 9.704193548387097e-05, + "loss": 0.2029, + "step": 2335 + }, + { + "epoch": 0.037376, + "grad_norm": 0.890625, + "learning_rate": 9.704032258064517e-05, + "loss": 0.1834, + "step": 2336 + }, + { + "epoch": 0.037392, + "grad_norm": 1.234375, + "learning_rate": 9.703870967741936e-05, + "loss": 0.206, + "step": 2337 + }, + { + "epoch": 0.037408, + "grad_norm": 0.6796875, + "learning_rate": 9.703709677419354e-05, + "loss": 0.1597, + "step": 2338 + }, + { + "epoch": 0.037424, + "grad_norm": 1.0546875, + "learning_rate": 9.703548387096774e-05, + "loss": 0.2291, + "step": 2339 + }, + { + "epoch": 0.03744, + "grad_norm": 0.84375, + "learning_rate": 9.703387096774193e-05, + "loss": 0.2167, + "step": 2340 + }, + { + "epoch": 0.037456, + "grad_norm": 1.4296875, + "learning_rate": 9.703225806451613e-05, + "loss": 0.2098, + "step": 2341 + }, + { + "epoch": 0.037472, + "grad_norm": 1.3984375, + "learning_rate": 9.703064516129033e-05, + "loss": 0.2172, + "step": 2342 + }, + { + "epoch": 0.037488, + "grad_norm": 1.2890625, + "learning_rate": 9.702903225806453e-05, + "loss": 0.2302, + "step": 2343 + }, + { + "epoch": 0.037504, + "grad_norm": 1.078125, + "learning_rate": 9.702741935483871e-05, + "loss": 0.2134, + "step": 2344 + }, + { + "epoch": 0.03752, + "grad_norm": 1.28125, + "learning_rate": 9.702580645161291e-05, + "loss": 0.2139, + "step": 2345 + }, + { + "epoch": 0.037536, + "grad_norm": 0.8671875, + "learning_rate": 9.70241935483871e-05, + "loss": 0.1839, + "step": 2346 + }, + { + "epoch": 0.037552, + "grad_norm": 0.79296875, + "learning_rate": 9.70225806451613e-05, + "loss": 0.1653, + "step": 2347 + }, + { + "epoch": 0.037568, + "grad_norm": 1.0078125, + "learning_rate": 9.702096774193549e-05, + "loss": 0.2231, + "step": 2348 + }, + { + "epoch": 0.037584, + "grad_norm": 0.8125, + "learning_rate": 9.701935483870969e-05, + "loss": 0.1836, + "step": 2349 + }, + { + "epoch": 0.0376, + "grad_norm": 0.58203125, + "learning_rate": 9.701774193548387e-05, + "loss": 0.1653, + "step": 2350 + }, + { + "epoch": 0.037616, + "grad_norm": 0.80078125, + "learning_rate": 9.701612903225807e-05, + "loss": 0.1387, + "step": 2351 + }, + { + "epoch": 0.037632, + "grad_norm": 1.2421875, + "learning_rate": 9.701451612903226e-05, + "loss": 0.228, + "step": 2352 + }, + { + "epoch": 0.037648, + "grad_norm": 1.046875, + "learning_rate": 9.701290322580646e-05, + "loss": 0.2264, + "step": 2353 + }, + { + "epoch": 0.037664, + "grad_norm": 0.7734375, + "learning_rate": 9.701129032258066e-05, + "loss": 0.1898, + "step": 2354 + }, + { + "epoch": 0.03768, + "grad_norm": 1.1953125, + "learning_rate": 9.700967741935484e-05, + "loss": 0.1983, + "step": 2355 + }, + { + "epoch": 0.037696, + "grad_norm": 0.90234375, + "learning_rate": 9.700806451612904e-05, + "loss": 0.1769, + "step": 2356 + }, + { + "epoch": 0.037712, + "grad_norm": 1.0078125, + "learning_rate": 9.700645161290323e-05, + "loss": 0.1821, + "step": 2357 + }, + { + "epoch": 0.037728, + "grad_norm": 0.87890625, + "learning_rate": 9.700483870967743e-05, + "loss": 0.1481, + "step": 2358 + }, + { + "epoch": 0.037744, + "grad_norm": 0.984375, + "learning_rate": 9.700322580645161e-05, + "loss": 0.184, + "step": 2359 + }, + { + "epoch": 0.03776, + "grad_norm": 0.94140625, + "learning_rate": 9.700161290322581e-05, + "loss": 0.1947, + "step": 2360 + }, + { + "epoch": 0.037776, + "grad_norm": 0.5234375, + "learning_rate": 9.7e-05, + "loss": 0.1437, + "step": 2361 + }, + { + "epoch": 0.037792, + "grad_norm": 0.9453125, + "learning_rate": 9.69983870967742e-05, + "loss": 0.177, + "step": 2362 + }, + { + "epoch": 0.037808, + "grad_norm": 0.95703125, + "learning_rate": 9.699677419354839e-05, + "loss": 0.1834, + "step": 2363 + }, + { + "epoch": 0.037824, + "grad_norm": 1.5078125, + "learning_rate": 9.699516129032258e-05, + "loss": 0.1609, + "step": 2364 + }, + { + "epoch": 0.03784, + "grad_norm": 0.69140625, + "learning_rate": 9.699354838709677e-05, + "loss": 0.1757, + "step": 2365 + }, + { + "epoch": 0.037856, + "grad_norm": 1.125, + "learning_rate": 9.699193548387097e-05, + "loss": 0.2176, + "step": 2366 + }, + { + "epoch": 0.037872, + "grad_norm": 1.546875, + "learning_rate": 9.699032258064517e-05, + "loss": 0.1657, + "step": 2367 + }, + { + "epoch": 0.037888, + "grad_norm": 1.5078125, + "learning_rate": 9.698870967741937e-05, + "loss": 0.2199, + "step": 2368 + }, + { + "epoch": 0.037904, + "grad_norm": 1.0546875, + "learning_rate": 9.698709677419356e-05, + "loss": 0.186, + "step": 2369 + }, + { + "epoch": 0.03792, + "grad_norm": 0.6171875, + "learning_rate": 9.698548387096774e-05, + "loss": 0.1567, + "step": 2370 + }, + { + "epoch": 0.037936, + "grad_norm": 0.8984375, + "learning_rate": 9.698387096774194e-05, + "loss": 0.2144, + "step": 2371 + }, + { + "epoch": 0.037952, + "grad_norm": 0.609375, + "learning_rate": 9.698225806451613e-05, + "loss": 0.1709, + "step": 2372 + }, + { + "epoch": 0.037968, + "grad_norm": 1.171875, + "learning_rate": 9.698064516129033e-05, + "loss": 0.2146, + "step": 2373 + }, + { + "epoch": 0.037984, + "grad_norm": 1.0234375, + "learning_rate": 9.697903225806451e-05, + "loss": 0.1734, + "step": 2374 + }, + { + "epoch": 0.038, + "grad_norm": 0.5859375, + "learning_rate": 9.697741935483871e-05, + "loss": 0.1487, + "step": 2375 + }, + { + "epoch": 0.038016, + "grad_norm": 0.8984375, + "learning_rate": 9.69758064516129e-05, + "loss": 0.2259, + "step": 2376 + }, + { + "epoch": 0.038032, + "grad_norm": 0.8125, + "learning_rate": 9.69741935483871e-05, + "loss": 0.2, + "step": 2377 + }, + { + "epoch": 0.038048, + "grad_norm": 1.09375, + "learning_rate": 9.69725806451613e-05, + "loss": 0.2301, + "step": 2378 + }, + { + "epoch": 0.038064, + "grad_norm": 0.83203125, + "learning_rate": 9.69709677419355e-05, + "loss": 0.197, + "step": 2379 + }, + { + "epoch": 0.03808, + "grad_norm": 1.0078125, + "learning_rate": 9.696935483870968e-05, + "loss": 0.1651, + "step": 2380 + }, + { + "epoch": 0.038096, + "grad_norm": 1.3828125, + "learning_rate": 9.696774193548388e-05, + "loss": 0.2364, + "step": 2381 + }, + { + "epoch": 0.038112, + "grad_norm": 1.421875, + "learning_rate": 9.696612903225807e-05, + "loss": 0.217, + "step": 2382 + }, + { + "epoch": 0.038128, + "grad_norm": 0.77734375, + "learning_rate": 9.696451612903227e-05, + "loss": 0.1719, + "step": 2383 + }, + { + "epoch": 0.038144, + "grad_norm": 0.68359375, + "learning_rate": 9.696290322580646e-05, + "loss": 0.2079, + "step": 2384 + }, + { + "epoch": 0.03816, + "grad_norm": 0.80859375, + "learning_rate": 9.696129032258064e-05, + "loss": 0.1807, + "step": 2385 + }, + { + "epoch": 0.038176, + "grad_norm": 0.875, + "learning_rate": 9.695967741935484e-05, + "loss": 0.1963, + "step": 2386 + }, + { + "epoch": 0.038192, + "grad_norm": 0.89453125, + "learning_rate": 9.695806451612903e-05, + "loss": 0.187, + "step": 2387 + }, + { + "epoch": 0.038208, + "grad_norm": 1.1875, + "learning_rate": 9.695645161290323e-05, + "loss": 0.1756, + "step": 2388 + }, + { + "epoch": 0.038224, + "grad_norm": 1.2421875, + "learning_rate": 9.695483870967743e-05, + "loss": 0.2218, + "step": 2389 + }, + { + "epoch": 0.03824, + "grad_norm": 0.921875, + "learning_rate": 9.695322580645163e-05, + "loss": 0.1673, + "step": 2390 + }, + { + "epoch": 0.038256, + "grad_norm": 0.8828125, + "learning_rate": 9.695161290322581e-05, + "loss": 0.2091, + "step": 2391 + }, + { + "epoch": 0.038272, + "grad_norm": 0.7890625, + "learning_rate": 9.695000000000001e-05, + "loss": 0.2008, + "step": 2392 + }, + { + "epoch": 0.038288, + "grad_norm": 1.7265625, + "learning_rate": 9.69483870967742e-05, + "loss": 0.2135, + "step": 2393 + }, + { + "epoch": 0.038304, + "grad_norm": 1.3359375, + "learning_rate": 9.69467741935484e-05, + "loss": 0.2573, + "step": 2394 + }, + { + "epoch": 0.03832, + "grad_norm": 0.9375, + "learning_rate": 9.694516129032258e-05, + "loss": 0.2056, + "step": 2395 + }, + { + "epoch": 0.038336, + "grad_norm": 2.265625, + "learning_rate": 9.694354838709678e-05, + "loss": 0.1999, + "step": 2396 + }, + { + "epoch": 0.038352, + "grad_norm": 1.0390625, + "learning_rate": 9.694193548387097e-05, + "loss": 0.1666, + "step": 2397 + }, + { + "epoch": 0.038368, + "grad_norm": 0.84765625, + "learning_rate": 9.694032258064517e-05, + "loss": 0.1608, + "step": 2398 + }, + { + "epoch": 0.038384, + "grad_norm": 1.0703125, + "learning_rate": 9.693870967741936e-05, + "loss": 0.218, + "step": 2399 + }, + { + "epoch": 0.0384, + "grad_norm": 0.69921875, + "learning_rate": 9.693709677419354e-05, + "loss": 0.2025, + "step": 2400 + }, + { + "epoch": 0.038416, + "grad_norm": 0.76953125, + "learning_rate": 9.693548387096774e-05, + "loss": 0.17, + "step": 2401 + }, + { + "epoch": 0.038432, + "grad_norm": 0.65625, + "learning_rate": 9.693387096774194e-05, + "loss": 0.1785, + "step": 2402 + }, + { + "epoch": 0.038448, + "grad_norm": 0.734375, + "learning_rate": 9.693225806451614e-05, + "loss": 0.1605, + "step": 2403 + }, + { + "epoch": 0.038464, + "grad_norm": 1.0390625, + "learning_rate": 9.693064516129033e-05, + "loss": 0.215, + "step": 2404 + }, + { + "epoch": 0.03848, + "grad_norm": 1.0, + "learning_rate": 9.692903225806453e-05, + "loss": 0.2452, + "step": 2405 + }, + { + "epoch": 0.038496, + "grad_norm": 1.2109375, + "learning_rate": 9.692741935483871e-05, + "loss": 0.235, + "step": 2406 + }, + { + "epoch": 0.038512, + "grad_norm": 0.9609375, + "learning_rate": 9.692580645161291e-05, + "loss": 0.2202, + "step": 2407 + }, + { + "epoch": 0.038528, + "grad_norm": 0.59765625, + "learning_rate": 9.69241935483871e-05, + "loss": 0.1465, + "step": 2408 + }, + { + "epoch": 0.038544, + "grad_norm": 0.8046875, + "learning_rate": 9.69225806451613e-05, + "loss": 0.1843, + "step": 2409 + }, + { + "epoch": 0.03856, + "grad_norm": 1.4140625, + "learning_rate": 9.692096774193548e-05, + "loss": 0.2352, + "step": 2410 + }, + { + "epoch": 0.038576, + "grad_norm": 0.59765625, + "learning_rate": 9.691935483870968e-05, + "loss": 0.1679, + "step": 2411 + }, + { + "epoch": 0.038592, + "grad_norm": 1.0703125, + "learning_rate": 9.691774193548387e-05, + "loss": 0.2127, + "step": 2412 + }, + { + "epoch": 0.038608, + "grad_norm": 0.79296875, + "learning_rate": 9.691612903225807e-05, + "loss": 0.1917, + "step": 2413 + }, + { + "epoch": 0.038624, + "grad_norm": 0.7734375, + "learning_rate": 9.691451612903227e-05, + "loss": 0.1912, + "step": 2414 + }, + { + "epoch": 0.03864, + "grad_norm": 0.8203125, + "learning_rate": 9.691290322580647e-05, + "loss": 0.2211, + "step": 2415 + }, + { + "epoch": 0.038656, + "grad_norm": 1.1484375, + "learning_rate": 9.691129032258065e-05, + "loss": 0.2227, + "step": 2416 + }, + { + "epoch": 0.038672, + "grad_norm": 0.7890625, + "learning_rate": 9.690967741935484e-05, + "loss": 0.2125, + "step": 2417 + }, + { + "epoch": 0.038688, + "grad_norm": 0.8828125, + "learning_rate": 9.690806451612904e-05, + "loss": 0.1841, + "step": 2418 + }, + { + "epoch": 0.038704, + "grad_norm": 0.890625, + "learning_rate": 9.690645161290323e-05, + "loss": 0.1758, + "step": 2419 + }, + { + "epoch": 0.03872, + "grad_norm": 3.140625, + "learning_rate": 9.690483870967743e-05, + "loss": 0.2685, + "step": 2420 + }, + { + "epoch": 0.038736, + "grad_norm": 1.1640625, + "learning_rate": 9.690322580645161e-05, + "loss": 0.1963, + "step": 2421 + }, + { + "epoch": 0.038752, + "grad_norm": 0.921875, + "learning_rate": 9.690161290322581e-05, + "loss": 0.1757, + "step": 2422 + }, + { + "epoch": 0.038768, + "grad_norm": 1.15625, + "learning_rate": 9.69e-05, + "loss": 0.234, + "step": 2423 + }, + { + "epoch": 0.038784, + "grad_norm": 1.0625, + "learning_rate": 9.68983870967742e-05, + "loss": 0.197, + "step": 2424 + }, + { + "epoch": 0.0388, + "grad_norm": 0.7265625, + "learning_rate": 9.68967741935484e-05, + "loss": 0.1818, + "step": 2425 + }, + { + "epoch": 0.038816, + "grad_norm": 0.87109375, + "learning_rate": 9.689516129032258e-05, + "loss": 0.1696, + "step": 2426 + }, + { + "epoch": 0.038832, + "grad_norm": 0.8671875, + "learning_rate": 9.689354838709678e-05, + "loss": 0.188, + "step": 2427 + }, + { + "epoch": 0.038848, + "grad_norm": 0.89453125, + "learning_rate": 9.689193548387098e-05, + "loss": 0.2015, + "step": 2428 + }, + { + "epoch": 0.038864, + "grad_norm": 1.2890625, + "learning_rate": 9.689032258064517e-05, + "loss": 0.2412, + "step": 2429 + }, + { + "epoch": 0.03888, + "grad_norm": 1.1796875, + "learning_rate": 9.688870967741937e-05, + "loss": 0.1975, + "step": 2430 + }, + { + "epoch": 0.038896, + "grad_norm": 1.34375, + "learning_rate": 9.688709677419355e-05, + "loss": 0.1558, + "step": 2431 + }, + { + "epoch": 0.038912, + "grad_norm": 1.046875, + "learning_rate": 9.688548387096774e-05, + "loss": 0.1684, + "step": 2432 + }, + { + "epoch": 0.038928, + "grad_norm": 0.90625, + "learning_rate": 9.688387096774194e-05, + "loss": 0.2573, + "step": 2433 + }, + { + "epoch": 0.038944, + "grad_norm": 0.984375, + "learning_rate": 9.688225806451613e-05, + "loss": 0.1977, + "step": 2434 + }, + { + "epoch": 0.03896, + "grad_norm": 1.1875, + "learning_rate": 9.688064516129032e-05, + "loss": 0.2011, + "step": 2435 + }, + { + "epoch": 0.038976, + "grad_norm": 1.1015625, + "learning_rate": 9.687903225806451e-05, + "loss": 0.2095, + "step": 2436 + }, + { + "epoch": 0.038992, + "grad_norm": 0.8125, + "learning_rate": 9.687741935483871e-05, + "loss": 0.2005, + "step": 2437 + }, + { + "epoch": 0.039008, + "grad_norm": 0.765625, + "learning_rate": 9.687580645161291e-05, + "loss": 0.1827, + "step": 2438 + }, + { + "epoch": 0.039024, + "grad_norm": 0.9921875, + "learning_rate": 9.687419354838711e-05, + "loss": 0.2087, + "step": 2439 + }, + { + "epoch": 0.03904, + "grad_norm": 0.66015625, + "learning_rate": 9.68725806451613e-05, + "loss": 0.1898, + "step": 2440 + }, + { + "epoch": 0.039056, + "grad_norm": 0.984375, + "learning_rate": 9.68709677419355e-05, + "loss": 0.1988, + "step": 2441 + }, + { + "epoch": 0.039072, + "grad_norm": 1.2578125, + "learning_rate": 9.686935483870968e-05, + "loss": 0.2122, + "step": 2442 + }, + { + "epoch": 0.039088, + "grad_norm": 1.1328125, + "learning_rate": 9.686774193548388e-05, + "loss": 0.1828, + "step": 2443 + }, + { + "epoch": 0.039104, + "grad_norm": 1.1796875, + "learning_rate": 9.686612903225807e-05, + "loss": 0.2342, + "step": 2444 + }, + { + "epoch": 0.03912, + "grad_norm": 1.7109375, + "learning_rate": 9.686451612903227e-05, + "loss": 0.1767, + "step": 2445 + }, + { + "epoch": 0.039136, + "grad_norm": 1.3125, + "learning_rate": 9.686290322580645e-05, + "loss": 0.1717, + "step": 2446 + }, + { + "epoch": 0.039152, + "grad_norm": 0.765625, + "learning_rate": 9.686129032258064e-05, + "loss": 0.2081, + "step": 2447 + }, + { + "epoch": 0.039168, + "grad_norm": 1.40625, + "learning_rate": 9.685967741935484e-05, + "loss": 0.155, + "step": 2448 + }, + { + "epoch": 0.039184, + "grad_norm": 1.625, + "learning_rate": 9.685806451612904e-05, + "loss": 0.186, + "step": 2449 + }, + { + "epoch": 0.0392, + "grad_norm": 1.1171875, + "learning_rate": 9.685645161290324e-05, + "loss": 0.1998, + "step": 2450 + }, + { + "epoch": 0.039216, + "grad_norm": 0.90234375, + "learning_rate": 9.685483870967742e-05, + "loss": 0.2268, + "step": 2451 + }, + { + "epoch": 0.039232, + "grad_norm": 0.75390625, + "learning_rate": 9.685322580645162e-05, + "loss": 0.1613, + "step": 2452 + }, + { + "epoch": 0.039248, + "grad_norm": 0.81640625, + "learning_rate": 9.685161290322581e-05, + "loss": 0.2379, + "step": 2453 + }, + { + "epoch": 0.039264, + "grad_norm": 0.9453125, + "learning_rate": 9.685000000000001e-05, + "loss": 0.2129, + "step": 2454 + }, + { + "epoch": 0.03928, + "grad_norm": 1.09375, + "learning_rate": 9.68483870967742e-05, + "loss": 0.2157, + "step": 2455 + }, + { + "epoch": 0.039296, + "grad_norm": 1.4375, + "learning_rate": 9.68467741935484e-05, + "loss": 0.1844, + "step": 2456 + }, + { + "epoch": 0.039312, + "grad_norm": 0.796875, + "learning_rate": 9.684516129032258e-05, + "loss": 0.1516, + "step": 2457 + }, + { + "epoch": 0.039328, + "grad_norm": 0.9921875, + "learning_rate": 9.684354838709678e-05, + "loss": 0.198, + "step": 2458 + }, + { + "epoch": 0.039344, + "grad_norm": 0.90234375, + "learning_rate": 9.684193548387097e-05, + "loss": 0.1819, + "step": 2459 + }, + { + "epoch": 0.03936, + "grad_norm": 1.015625, + "learning_rate": 9.684032258064517e-05, + "loss": 0.1662, + "step": 2460 + }, + { + "epoch": 0.039376, + "grad_norm": 0.8984375, + "learning_rate": 9.683870967741935e-05, + "loss": 0.2031, + "step": 2461 + }, + { + "epoch": 0.039392, + "grad_norm": 0.82421875, + "learning_rate": 9.683709677419355e-05, + "loss": 0.1835, + "step": 2462 + }, + { + "epoch": 0.039408, + "grad_norm": 0.95703125, + "learning_rate": 9.683548387096775e-05, + "loss": 0.203, + "step": 2463 + }, + { + "epoch": 0.039424, + "grad_norm": 0.890625, + "learning_rate": 9.683387096774194e-05, + "loss": 0.2181, + "step": 2464 + }, + { + "epoch": 0.03944, + "grad_norm": 0.87109375, + "learning_rate": 9.683225806451614e-05, + "loss": 0.2256, + "step": 2465 + }, + { + "epoch": 0.039456, + "grad_norm": 1.046875, + "learning_rate": 9.683064516129032e-05, + "loss": 0.2023, + "step": 2466 + }, + { + "epoch": 0.039472, + "grad_norm": 0.6796875, + "learning_rate": 9.682903225806452e-05, + "loss": 0.1661, + "step": 2467 + }, + { + "epoch": 0.039488, + "grad_norm": 0.81640625, + "learning_rate": 9.682741935483871e-05, + "loss": 0.1995, + "step": 2468 + }, + { + "epoch": 0.039504, + "grad_norm": 1.5, + "learning_rate": 9.682580645161291e-05, + "loss": 0.215, + "step": 2469 + }, + { + "epoch": 0.03952, + "grad_norm": 1.0703125, + "learning_rate": 9.68241935483871e-05, + "loss": 0.1716, + "step": 2470 + }, + { + "epoch": 0.039536, + "grad_norm": 1.0703125, + "learning_rate": 9.68225806451613e-05, + "loss": 0.212, + "step": 2471 + }, + { + "epoch": 0.039552, + "grad_norm": 1.15625, + "learning_rate": 9.682096774193548e-05, + "loss": 0.2679, + "step": 2472 + }, + { + "epoch": 0.039568, + "grad_norm": 0.91015625, + "learning_rate": 9.681935483870968e-05, + "loss": 0.1569, + "step": 2473 + }, + { + "epoch": 0.039584, + "grad_norm": 1.2578125, + "learning_rate": 9.681774193548388e-05, + "loss": 0.2324, + "step": 2474 + }, + { + "epoch": 0.0396, + "grad_norm": 1.015625, + "learning_rate": 9.681612903225808e-05, + "loss": 0.2073, + "step": 2475 + }, + { + "epoch": 0.039616, + "grad_norm": 1.2109375, + "learning_rate": 9.681451612903227e-05, + "loss": 0.2351, + "step": 2476 + }, + { + "epoch": 0.039632, + "grad_norm": 0.80859375, + "learning_rate": 9.681290322580647e-05, + "loss": 0.2317, + "step": 2477 + }, + { + "epoch": 0.039648, + "grad_norm": 0.93359375, + "learning_rate": 9.681129032258065e-05, + "loss": 0.1712, + "step": 2478 + }, + { + "epoch": 0.039664, + "grad_norm": 1.3203125, + "learning_rate": 9.680967741935484e-05, + "loss": 0.1845, + "step": 2479 + }, + { + "epoch": 0.03968, + "grad_norm": 0.87109375, + "learning_rate": 9.680806451612904e-05, + "loss": 0.2123, + "step": 2480 + }, + { + "epoch": 0.039696, + "grad_norm": 1.078125, + "learning_rate": 9.680645161290322e-05, + "loss": 0.2288, + "step": 2481 + }, + { + "epoch": 0.039712, + "grad_norm": 0.640625, + "learning_rate": 9.680483870967742e-05, + "loss": 0.1424, + "step": 2482 + }, + { + "epoch": 0.039728, + "grad_norm": 1.4453125, + "learning_rate": 9.680322580645161e-05, + "loss": 0.2273, + "step": 2483 + }, + { + "epoch": 0.039744, + "grad_norm": 0.90625, + "learning_rate": 9.680161290322581e-05, + "loss": 0.1995, + "step": 2484 + }, + { + "epoch": 0.03976, + "grad_norm": 0.74609375, + "learning_rate": 9.680000000000001e-05, + "loss": 0.1774, + "step": 2485 + }, + { + "epoch": 0.039776, + "grad_norm": 0.88671875, + "learning_rate": 9.679838709677421e-05, + "loss": 0.1973, + "step": 2486 + }, + { + "epoch": 0.039792, + "grad_norm": 1.0390625, + "learning_rate": 9.67967741935484e-05, + "loss": 0.2051, + "step": 2487 + }, + { + "epoch": 0.039808, + "grad_norm": 0.9609375, + "learning_rate": 9.67951612903226e-05, + "loss": 0.2134, + "step": 2488 + }, + { + "epoch": 0.039824, + "grad_norm": 1.4375, + "learning_rate": 9.679354838709678e-05, + "loss": 0.2031, + "step": 2489 + }, + { + "epoch": 0.03984, + "grad_norm": 0.91796875, + "learning_rate": 9.679193548387098e-05, + "loss": 0.1894, + "step": 2490 + }, + { + "epoch": 0.039856, + "grad_norm": 1.4375, + "learning_rate": 9.679032258064517e-05, + "loss": 0.2196, + "step": 2491 + }, + { + "epoch": 0.039872, + "grad_norm": 1.0625, + "learning_rate": 9.678870967741937e-05, + "loss": 0.28, + "step": 2492 + }, + { + "epoch": 0.039888, + "grad_norm": 1.140625, + "learning_rate": 9.678709677419355e-05, + "loss": 0.2215, + "step": 2493 + }, + { + "epoch": 0.039904, + "grad_norm": 1.1796875, + "learning_rate": 9.678548387096774e-05, + "loss": 0.2081, + "step": 2494 + }, + { + "epoch": 0.03992, + "grad_norm": 1.3203125, + "learning_rate": 9.678387096774194e-05, + "loss": 0.1812, + "step": 2495 + }, + { + "epoch": 0.039936, + "grad_norm": 0.90234375, + "learning_rate": 9.678225806451612e-05, + "loss": 0.229, + "step": 2496 + }, + { + "epoch": 0.039952, + "grad_norm": 0.890625, + "learning_rate": 9.678064516129032e-05, + "loss": 0.1654, + "step": 2497 + }, + { + "epoch": 0.039968, + "grad_norm": 0.83203125, + "learning_rate": 9.677903225806452e-05, + "loss": 0.181, + "step": 2498 + }, + { + "epoch": 0.039984, + "grad_norm": 0.89453125, + "learning_rate": 9.677741935483872e-05, + "loss": 0.1875, + "step": 2499 + }, + { + "epoch": 0.04, + "grad_norm": 0.73828125, + "learning_rate": 9.677580645161291e-05, + "loss": 0.1899, + "step": 2500 + }, + { + "epoch": 0.040016, + "grad_norm": 0.8125, + "learning_rate": 9.677419354838711e-05, + "loss": 0.1578, + "step": 2501 + }, + { + "epoch": 0.040032, + "grad_norm": 1.0234375, + "learning_rate": 9.67725806451613e-05, + "loss": 0.1929, + "step": 2502 + }, + { + "epoch": 0.040048, + "grad_norm": 0.74609375, + "learning_rate": 9.67709677419355e-05, + "loss": 0.2074, + "step": 2503 + }, + { + "epoch": 0.040064, + "grad_norm": 0.78515625, + "learning_rate": 9.676935483870968e-05, + "loss": 0.1618, + "step": 2504 + }, + { + "epoch": 0.04008, + "grad_norm": 0.98828125, + "learning_rate": 9.676774193548388e-05, + "loss": 0.1861, + "step": 2505 + }, + { + "epoch": 0.040096, + "grad_norm": 0.84765625, + "learning_rate": 9.676612903225807e-05, + "loss": 0.2147, + "step": 2506 + }, + { + "epoch": 0.040112, + "grad_norm": 0.6796875, + "learning_rate": 9.676451612903226e-05, + "loss": 0.1629, + "step": 2507 + }, + { + "epoch": 0.040128, + "grad_norm": 0.515625, + "learning_rate": 9.676290322580645e-05, + "loss": 0.1409, + "step": 2508 + }, + { + "epoch": 0.040144, + "grad_norm": 0.99609375, + "learning_rate": 9.676129032258065e-05, + "loss": 0.2151, + "step": 2509 + }, + { + "epoch": 0.04016, + "grad_norm": 0.9140625, + "learning_rate": 9.675967741935485e-05, + "loss": 0.2228, + "step": 2510 + }, + { + "epoch": 0.040176, + "grad_norm": 0.89453125, + "learning_rate": 9.675806451612904e-05, + "loss": 0.1944, + "step": 2511 + }, + { + "epoch": 0.040192, + "grad_norm": 0.6796875, + "learning_rate": 9.675645161290324e-05, + "loss": 0.165, + "step": 2512 + }, + { + "epoch": 0.040208, + "grad_norm": 0.75, + "learning_rate": 9.675483870967742e-05, + "loss": 0.1785, + "step": 2513 + }, + { + "epoch": 0.040224, + "grad_norm": 1.0390625, + "learning_rate": 9.675322580645162e-05, + "loss": 0.1917, + "step": 2514 + }, + { + "epoch": 0.04024, + "grad_norm": 1.015625, + "learning_rate": 9.675161290322581e-05, + "loss": 0.1825, + "step": 2515 + }, + { + "epoch": 0.040256, + "grad_norm": 1.21875, + "learning_rate": 9.675000000000001e-05, + "loss": 0.1677, + "step": 2516 + }, + { + "epoch": 0.040272, + "grad_norm": 0.94921875, + "learning_rate": 9.674838709677419e-05, + "loss": 0.1525, + "step": 2517 + }, + { + "epoch": 0.040288, + "grad_norm": 0.92578125, + "learning_rate": 9.674677419354839e-05, + "loss": 0.1964, + "step": 2518 + }, + { + "epoch": 0.040304, + "grad_norm": 0.76171875, + "learning_rate": 9.674516129032258e-05, + "loss": 0.1678, + "step": 2519 + }, + { + "epoch": 0.04032, + "grad_norm": 0.84375, + "learning_rate": 9.674354838709678e-05, + "loss": 0.2112, + "step": 2520 + }, + { + "epoch": 0.040336, + "grad_norm": 1.3828125, + "learning_rate": 9.674193548387096e-05, + "loss": 0.2037, + "step": 2521 + }, + { + "epoch": 0.040352, + "grad_norm": 1.015625, + "learning_rate": 9.674032258064516e-05, + "loss": 0.2063, + "step": 2522 + }, + { + "epoch": 0.040368, + "grad_norm": 0.84375, + "learning_rate": 9.673870967741936e-05, + "loss": 0.2261, + "step": 2523 + }, + { + "epoch": 0.040384, + "grad_norm": 0.94140625, + "learning_rate": 9.673709677419356e-05, + "loss": 0.2025, + "step": 2524 + }, + { + "epoch": 0.0404, + "grad_norm": 0.95703125, + "learning_rate": 9.673548387096775e-05, + "loss": 0.1977, + "step": 2525 + }, + { + "epoch": 0.040416, + "grad_norm": 1.0078125, + "learning_rate": 9.673387096774194e-05, + "loss": 0.1784, + "step": 2526 + }, + { + "epoch": 0.040432, + "grad_norm": 1.1875, + "learning_rate": 9.673225806451614e-05, + "loss": 0.2044, + "step": 2527 + }, + { + "epoch": 0.040448, + "grad_norm": 0.75390625, + "learning_rate": 9.673064516129032e-05, + "loss": 0.1902, + "step": 2528 + }, + { + "epoch": 0.040464, + "grad_norm": 0.859375, + "learning_rate": 9.672903225806452e-05, + "loss": 0.1971, + "step": 2529 + }, + { + "epoch": 0.04048, + "grad_norm": 0.6953125, + "learning_rate": 9.672741935483871e-05, + "loss": 0.1463, + "step": 2530 + }, + { + "epoch": 0.040496, + "grad_norm": 0.859375, + "learning_rate": 9.67258064516129e-05, + "loss": 0.1858, + "step": 2531 + }, + { + "epoch": 0.040512, + "grad_norm": 0.89453125, + "learning_rate": 9.672419354838709e-05, + "loss": 0.2006, + "step": 2532 + }, + { + "epoch": 0.040528, + "grad_norm": 1.03125, + "learning_rate": 9.672258064516129e-05, + "loss": 0.165, + "step": 2533 + }, + { + "epoch": 0.040544, + "grad_norm": 1.1796875, + "learning_rate": 9.672096774193549e-05, + "loss": 0.2242, + "step": 2534 + }, + { + "epoch": 0.04056, + "grad_norm": 0.8125, + "learning_rate": 9.671935483870969e-05, + "loss": 0.1974, + "step": 2535 + }, + { + "epoch": 0.040576, + "grad_norm": 1.359375, + "learning_rate": 9.671774193548388e-05, + "loss": 0.1845, + "step": 2536 + }, + { + "epoch": 0.040592, + "grad_norm": 1.03125, + "learning_rate": 9.671612903225808e-05, + "loss": 0.1856, + "step": 2537 + }, + { + "epoch": 0.040608, + "grad_norm": 1.1171875, + "learning_rate": 9.671451612903226e-05, + "loss": 0.1955, + "step": 2538 + }, + { + "epoch": 0.040624, + "grad_norm": 0.84375, + "learning_rate": 9.671290322580646e-05, + "loss": 0.2186, + "step": 2539 + }, + { + "epoch": 0.04064, + "grad_norm": 0.875, + "learning_rate": 9.671129032258065e-05, + "loss": 0.1901, + "step": 2540 + }, + { + "epoch": 0.040656, + "grad_norm": 1.234375, + "learning_rate": 9.670967741935484e-05, + "loss": 0.2408, + "step": 2541 + }, + { + "epoch": 0.040672, + "grad_norm": 1.6953125, + "learning_rate": 9.670806451612903e-05, + "loss": 0.1659, + "step": 2542 + }, + { + "epoch": 0.040688, + "grad_norm": 0.9609375, + "learning_rate": 9.670645161290322e-05, + "loss": 0.2157, + "step": 2543 + }, + { + "epoch": 0.040704, + "grad_norm": 1.3828125, + "learning_rate": 9.670483870967742e-05, + "loss": 0.1616, + "step": 2544 + }, + { + "epoch": 0.04072, + "grad_norm": 1.125, + "learning_rate": 9.670322580645162e-05, + "loss": 0.2126, + "step": 2545 + }, + { + "epoch": 0.040736, + "grad_norm": 1.1171875, + "learning_rate": 9.670161290322582e-05, + "loss": 0.2096, + "step": 2546 + }, + { + "epoch": 0.040752, + "grad_norm": 0.98046875, + "learning_rate": 9.67e-05, + "loss": 0.1993, + "step": 2547 + }, + { + "epoch": 0.040768, + "grad_norm": 1.3359375, + "learning_rate": 9.66983870967742e-05, + "loss": 0.2304, + "step": 2548 + }, + { + "epoch": 0.040784, + "grad_norm": 1.4375, + "learning_rate": 9.669677419354839e-05, + "loss": 0.2305, + "step": 2549 + }, + { + "epoch": 0.0408, + "grad_norm": 1.140625, + "learning_rate": 9.669516129032259e-05, + "loss": 0.1921, + "step": 2550 + }, + { + "epoch": 0.040816, + "grad_norm": 1.109375, + "learning_rate": 9.669354838709678e-05, + "loss": 0.1629, + "step": 2551 + }, + { + "epoch": 0.040832, + "grad_norm": 1.0, + "learning_rate": 9.669193548387098e-05, + "loss": 0.2144, + "step": 2552 + }, + { + "epoch": 0.040848, + "grad_norm": 1.5625, + "learning_rate": 9.669032258064516e-05, + "loss": 0.2093, + "step": 2553 + }, + { + "epoch": 0.040864, + "grad_norm": 1.6484375, + "learning_rate": 9.668870967741936e-05, + "loss": 0.1717, + "step": 2554 + }, + { + "epoch": 0.04088, + "grad_norm": 1.0078125, + "learning_rate": 9.668709677419355e-05, + "loss": 0.1732, + "step": 2555 + }, + { + "epoch": 0.040896, + "grad_norm": 0.828125, + "learning_rate": 9.668548387096773e-05, + "loss": 0.194, + "step": 2556 + }, + { + "epoch": 0.040912, + "grad_norm": 0.89453125, + "learning_rate": 9.668387096774193e-05, + "loss": 0.2152, + "step": 2557 + }, + { + "epoch": 0.040928, + "grad_norm": 0.99609375, + "learning_rate": 9.668225806451613e-05, + "loss": 0.2232, + "step": 2558 + }, + { + "epoch": 0.040944, + "grad_norm": 1.53125, + "learning_rate": 9.668064516129033e-05, + "loss": 0.1699, + "step": 2559 + }, + { + "epoch": 0.04096, + "grad_norm": 0.71875, + "learning_rate": 9.667903225806452e-05, + "loss": 0.1881, + "step": 2560 + }, + { + "epoch": 0.040976, + "grad_norm": 1.4375, + "learning_rate": 9.667741935483872e-05, + "loss": 0.1617, + "step": 2561 + }, + { + "epoch": 0.040992, + "grad_norm": 1.078125, + "learning_rate": 9.66758064516129e-05, + "loss": 0.1768, + "step": 2562 + }, + { + "epoch": 0.041008, + "grad_norm": 1.0625, + "learning_rate": 9.66741935483871e-05, + "loss": 0.1716, + "step": 2563 + }, + { + "epoch": 0.041024, + "grad_norm": 1.0078125, + "learning_rate": 9.667258064516129e-05, + "loss": 0.2005, + "step": 2564 + }, + { + "epoch": 0.04104, + "grad_norm": 0.7421875, + "learning_rate": 9.667096774193549e-05, + "loss": 0.1916, + "step": 2565 + }, + { + "epoch": 0.041056, + "grad_norm": 1.2421875, + "learning_rate": 9.666935483870968e-05, + "loss": 0.2101, + "step": 2566 + }, + { + "epoch": 0.041072, + "grad_norm": 1.6875, + "learning_rate": 9.666774193548388e-05, + "loss": 0.1836, + "step": 2567 + }, + { + "epoch": 0.041088, + "grad_norm": 1.96875, + "learning_rate": 9.666612903225806e-05, + "loss": 0.2468, + "step": 2568 + }, + { + "epoch": 0.041104, + "grad_norm": 1.71875, + "learning_rate": 9.666451612903226e-05, + "loss": 0.184, + "step": 2569 + }, + { + "epoch": 0.04112, + "grad_norm": 0.7265625, + "learning_rate": 9.666290322580646e-05, + "loss": 0.183, + "step": 2570 + }, + { + "epoch": 0.041136, + "grad_norm": 1.171875, + "learning_rate": 9.666129032258065e-05, + "loss": 0.2399, + "step": 2571 + }, + { + "epoch": 0.041152, + "grad_norm": 1.3203125, + "learning_rate": 9.665967741935485e-05, + "loss": 0.2443, + "step": 2572 + }, + { + "epoch": 0.041168, + "grad_norm": 0.9375, + "learning_rate": 9.665806451612903e-05, + "loss": 0.2237, + "step": 2573 + }, + { + "epoch": 0.041184, + "grad_norm": 0.89453125, + "learning_rate": 9.665645161290323e-05, + "loss": 0.1761, + "step": 2574 + }, + { + "epoch": 0.0412, + "grad_norm": 0.875, + "learning_rate": 9.665483870967742e-05, + "loss": 0.1925, + "step": 2575 + }, + { + "epoch": 0.041216, + "grad_norm": 0.98828125, + "learning_rate": 9.665322580645162e-05, + "loss": 0.1691, + "step": 2576 + }, + { + "epoch": 0.041232, + "grad_norm": 1.515625, + "learning_rate": 9.66516129032258e-05, + "loss": 0.2166, + "step": 2577 + }, + { + "epoch": 0.041248, + "grad_norm": 0.96484375, + "learning_rate": 9.665e-05, + "loss": 0.1461, + "step": 2578 + }, + { + "epoch": 0.041264, + "grad_norm": 0.87890625, + "learning_rate": 9.664838709677419e-05, + "loss": 0.1779, + "step": 2579 + }, + { + "epoch": 0.04128, + "grad_norm": 1.2734375, + "learning_rate": 9.664677419354839e-05, + "loss": 0.1521, + "step": 2580 + }, + { + "epoch": 0.041296, + "grad_norm": 1.375, + "learning_rate": 9.664516129032259e-05, + "loss": 0.2108, + "step": 2581 + }, + { + "epoch": 0.041312, + "grad_norm": 1.0078125, + "learning_rate": 9.664354838709678e-05, + "loss": 0.158, + "step": 2582 + }, + { + "epoch": 0.041328, + "grad_norm": 1.3046875, + "learning_rate": 9.664193548387098e-05, + "loss": 0.183, + "step": 2583 + }, + { + "epoch": 0.041344, + "grad_norm": 0.9140625, + "learning_rate": 9.664032258064518e-05, + "loss": 0.1996, + "step": 2584 + }, + { + "epoch": 0.04136, + "grad_norm": 0.7421875, + "learning_rate": 9.663870967741936e-05, + "loss": 0.1751, + "step": 2585 + }, + { + "epoch": 0.041376, + "grad_norm": 0.7421875, + "learning_rate": 9.663709677419356e-05, + "loss": 0.189, + "step": 2586 + }, + { + "epoch": 0.041392, + "grad_norm": 1.2109375, + "learning_rate": 9.663548387096775e-05, + "loss": 0.1703, + "step": 2587 + }, + { + "epoch": 0.041408, + "grad_norm": 0.94140625, + "learning_rate": 9.663387096774193e-05, + "loss": 0.1909, + "step": 2588 + }, + { + "epoch": 0.041424, + "grad_norm": 1.2734375, + "learning_rate": 9.663225806451613e-05, + "loss": 0.1486, + "step": 2589 + }, + { + "epoch": 0.04144, + "grad_norm": 1.203125, + "learning_rate": 9.663064516129032e-05, + "loss": 0.2006, + "step": 2590 + }, + { + "epoch": 0.041456, + "grad_norm": 1.203125, + "learning_rate": 9.662903225806452e-05, + "loss": 0.2143, + "step": 2591 + }, + { + "epoch": 0.041472, + "grad_norm": 0.8046875, + "learning_rate": 9.66274193548387e-05, + "loss": 0.1855, + "step": 2592 + }, + { + "epoch": 0.041488, + "grad_norm": 1.25, + "learning_rate": 9.66258064516129e-05, + "loss": 0.2038, + "step": 2593 + }, + { + "epoch": 0.041504, + "grad_norm": 1.0546875, + "learning_rate": 9.66241935483871e-05, + "loss": 0.1828, + "step": 2594 + }, + { + "epoch": 0.04152, + "grad_norm": 1.46875, + "learning_rate": 9.66225806451613e-05, + "loss": 0.2161, + "step": 2595 + }, + { + "epoch": 0.041536, + "grad_norm": 1.3515625, + "learning_rate": 9.662096774193549e-05, + "loss": 0.193, + "step": 2596 + }, + { + "epoch": 0.041552, + "grad_norm": 0.82421875, + "learning_rate": 9.661935483870969e-05, + "loss": 0.1794, + "step": 2597 + }, + { + "epoch": 0.041568, + "grad_norm": 1.1015625, + "learning_rate": 9.661774193548388e-05, + "loss": 0.2124, + "step": 2598 + }, + { + "epoch": 0.041584, + "grad_norm": 0.87890625, + "learning_rate": 9.661612903225807e-05, + "loss": 0.1932, + "step": 2599 + }, + { + "epoch": 0.0416, + "grad_norm": 0.96484375, + "learning_rate": 9.661451612903226e-05, + "loss": 0.1725, + "step": 2600 + }, + { + "epoch": 0.041616, + "grad_norm": 1.0546875, + "learning_rate": 9.661290322580646e-05, + "loss": 0.2139, + "step": 2601 + }, + { + "epoch": 0.041632, + "grad_norm": 0.90234375, + "learning_rate": 9.661129032258065e-05, + "loss": 0.1692, + "step": 2602 + }, + { + "epoch": 0.041648, + "grad_norm": 0.890625, + "learning_rate": 9.660967741935483e-05, + "loss": 0.1786, + "step": 2603 + }, + { + "epoch": 0.041664, + "grad_norm": 1.0859375, + "learning_rate": 9.660806451612903e-05, + "loss": 0.1948, + "step": 2604 + }, + { + "epoch": 0.04168, + "grad_norm": 1.0390625, + "learning_rate": 9.660645161290323e-05, + "loss": 0.201, + "step": 2605 + }, + { + "epoch": 0.041696, + "grad_norm": 1.296875, + "learning_rate": 9.660483870967743e-05, + "loss": 0.2223, + "step": 2606 + }, + { + "epoch": 0.041712, + "grad_norm": 0.76171875, + "learning_rate": 9.660322580645162e-05, + "loss": 0.1908, + "step": 2607 + }, + { + "epoch": 0.041728, + "grad_norm": 0.76953125, + "learning_rate": 9.660161290322582e-05, + "loss": 0.1909, + "step": 2608 + }, + { + "epoch": 0.041744, + "grad_norm": 1.5859375, + "learning_rate": 9.66e-05, + "loss": 0.2055, + "step": 2609 + }, + { + "epoch": 0.04176, + "grad_norm": 1.1484375, + "learning_rate": 9.65983870967742e-05, + "loss": 0.1739, + "step": 2610 + }, + { + "epoch": 0.041776, + "grad_norm": 2.421875, + "learning_rate": 9.659677419354839e-05, + "loss": 0.2319, + "step": 2611 + }, + { + "epoch": 0.041792, + "grad_norm": 1.390625, + "learning_rate": 9.659516129032259e-05, + "loss": 0.214, + "step": 2612 + }, + { + "epoch": 0.041808, + "grad_norm": 1.3125, + "learning_rate": 9.659354838709677e-05, + "loss": 0.1877, + "step": 2613 + }, + { + "epoch": 0.041824, + "grad_norm": 0.921875, + "learning_rate": 9.659193548387097e-05, + "loss": 0.2042, + "step": 2614 + }, + { + "epoch": 0.04184, + "grad_norm": 0.96484375, + "learning_rate": 9.659032258064516e-05, + "loss": 0.1962, + "step": 2615 + }, + { + "epoch": 0.041856, + "grad_norm": 0.765625, + "learning_rate": 9.658870967741936e-05, + "loss": 0.1853, + "step": 2616 + }, + { + "epoch": 0.041872, + "grad_norm": 0.95703125, + "learning_rate": 9.658709677419355e-05, + "loss": 0.1834, + "step": 2617 + }, + { + "epoch": 0.041888, + "grad_norm": 1.2109375, + "learning_rate": 9.658548387096775e-05, + "loss": 0.1888, + "step": 2618 + }, + { + "epoch": 0.041904, + "grad_norm": 1.3515625, + "learning_rate": 9.658387096774195e-05, + "loss": 0.1897, + "step": 2619 + }, + { + "epoch": 0.04192, + "grad_norm": 0.96875, + "learning_rate": 9.658225806451613e-05, + "loss": 0.1455, + "step": 2620 + }, + { + "epoch": 0.041936, + "grad_norm": 0.8359375, + "learning_rate": 9.658064516129033e-05, + "loss": 0.1846, + "step": 2621 + }, + { + "epoch": 0.041952, + "grad_norm": 0.6953125, + "learning_rate": 9.657903225806452e-05, + "loss": 0.1772, + "step": 2622 + }, + { + "epoch": 0.041968, + "grad_norm": 0.87109375, + "learning_rate": 9.657741935483872e-05, + "loss": 0.156, + "step": 2623 + }, + { + "epoch": 0.041984, + "grad_norm": 0.734375, + "learning_rate": 9.65758064516129e-05, + "loss": 0.1889, + "step": 2624 + }, + { + "epoch": 0.042, + "grad_norm": 1.03125, + "learning_rate": 9.65741935483871e-05, + "loss": 0.2029, + "step": 2625 + }, + { + "epoch": 0.042016, + "grad_norm": 2.015625, + "learning_rate": 9.657258064516129e-05, + "loss": 0.2247, + "step": 2626 + }, + { + "epoch": 0.042032, + "grad_norm": 0.91015625, + "learning_rate": 9.657096774193549e-05, + "loss": 0.2005, + "step": 2627 + }, + { + "epoch": 0.042048, + "grad_norm": 1.3359375, + "learning_rate": 9.656935483870967e-05, + "loss": 0.2069, + "step": 2628 + }, + { + "epoch": 0.042064, + "grad_norm": 1.015625, + "learning_rate": 9.656774193548387e-05, + "loss": 0.1854, + "step": 2629 + }, + { + "epoch": 0.04208, + "grad_norm": 1.1171875, + "learning_rate": 9.656612903225807e-05, + "loss": 0.2114, + "step": 2630 + }, + { + "epoch": 0.042096, + "grad_norm": 0.66015625, + "learning_rate": 9.656451612903227e-05, + "loss": 0.1759, + "step": 2631 + }, + { + "epoch": 0.042112, + "grad_norm": 1.2109375, + "learning_rate": 9.656290322580646e-05, + "loss": 0.1962, + "step": 2632 + }, + { + "epoch": 0.042128, + "grad_norm": 0.84765625, + "learning_rate": 9.656129032258066e-05, + "loss": 0.17, + "step": 2633 + }, + { + "epoch": 0.042144, + "grad_norm": 0.91796875, + "learning_rate": 9.655967741935485e-05, + "loss": 0.1656, + "step": 2634 + }, + { + "epoch": 0.04216, + "grad_norm": 0.6875, + "learning_rate": 9.655806451612903e-05, + "loss": 0.1797, + "step": 2635 + }, + { + "epoch": 0.042176, + "grad_norm": 0.609375, + "learning_rate": 9.655645161290323e-05, + "loss": 0.1752, + "step": 2636 + }, + { + "epoch": 0.042192, + "grad_norm": 1.2578125, + "learning_rate": 9.655483870967742e-05, + "loss": 0.16, + "step": 2637 + }, + { + "epoch": 0.042208, + "grad_norm": 1.5, + "learning_rate": 9.655322580645162e-05, + "loss": 0.2051, + "step": 2638 + }, + { + "epoch": 0.042224, + "grad_norm": 1.1640625, + "learning_rate": 9.65516129032258e-05, + "loss": 0.2221, + "step": 2639 + }, + { + "epoch": 0.04224, + "grad_norm": 1.3046875, + "learning_rate": 9.655e-05, + "loss": 0.2057, + "step": 2640 + }, + { + "epoch": 0.042256, + "grad_norm": 0.65234375, + "learning_rate": 9.65483870967742e-05, + "loss": 0.1662, + "step": 2641 + }, + { + "epoch": 0.042272, + "grad_norm": 0.56640625, + "learning_rate": 9.65467741935484e-05, + "loss": 0.145, + "step": 2642 + }, + { + "epoch": 0.042288, + "grad_norm": 0.9375, + "learning_rate": 9.654516129032259e-05, + "loss": 0.2175, + "step": 2643 + }, + { + "epoch": 0.042304, + "grad_norm": 1.21875, + "learning_rate": 9.654354838709679e-05, + "loss": 0.1715, + "step": 2644 + }, + { + "epoch": 0.04232, + "grad_norm": 0.87890625, + "learning_rate": 9.654193548387097e-05, + "loss": 0.2042, + "step": 2645 + }, + { + "epoch": 0.042336, + "grad_norm": 1.4296875, + "learning_rate": 9.654032258064517e-05, + "loss": 0.2001, + "step": 2646 + }, + { + "epoch": 0.042352, + "grad_norm": 0.64453125, + "learning_rate": 9.653870967741936e-05, + "loss": 0.1766, + "step": 2647 + }, + { + "epoch": 0.042368, + "grad_norm": 0.9140625, + "learning_rate": 9.653709677419356e-05, + "loss": 0.1984, + "step": 2648 + }, + { + "epoch": 0.042384, + "grad_norm": 0.75, + "learning_rate": 9.653548387096774e-05, + "loss": 0.1615, + "step": 2649 + }, + { + "epoch": 0.0424, + "grad_norm": 0.99609375, + "learning_rate": 9.653387096774193e-05, + "loss": 0.164, + "step": 2650 + }, + { + "epoch": 0.042416, + "grad_norm": 1.0859375, + "learning_rate": 9.653225806451613e-05, + "loss": 0.1605, + "step": 2651 + }, + { + "epoch": 0.042432, + "grad_norm": 0.79296875, + "learning_rate": 9.653064516129032e-05, + "loss": 0.2173, + "step": 2652 + }, + { + "epoch": 0.042448, + "grad_norm": 0.70703125, + "learning_rate": 9.652903225806452e-05, + "loss": 0.1642, + "step": 2653 + }, + { + "epoch": 0.042464, + "grad_norm": 0.578125, + "learning_rate": 9.652741935483872e-05, + "loss": 0.1817, + "step": 2654 + }, + { + "epoch": 0.04248, + "grad_norm": 0.87109375, + "learning_rate": 9.652580645161292e-05, + "loss": 0.1893, + "step": 2655 + }, + { + "epoch": 0.042496, + "grad_norm": 0.87109375, + "learning_rate": 9.65241935483871e-05, + "loss": 0.1548, + "step": 2656 + }, + { + "epoch": 0.042512, + "grad_norm": 1.0703125, + "learning_rate": 9.65225806451613e-05, + "loss": 0.1694, + "step": 2657 + }, + { + "epoch": 0.042528, + "grad_norm": 1.0, + "learning_rate": 9.652096774193549e-05, + "loss": 0.191, + "step": 2658 + }, + { + "epoch": 0.042544, + "grad_norm": 0.85546875, + "learning_rate": 9.651935483870969e-05, + "loss": 0.1426, + "step": 2659 + }, + { + "epoch": 0.04256, + "grad_norm": 0.91015625, + "learning_rate": 9.651774193548387e-05, + "loss": 0.1746, + "step": 2660 + }, + { + "epoch": 0.042576, + "grad_norm": 0.7109375, + "learning_rate": 9.651612903225807e-05, + "loss": 0.1904, + "step": 2661 + }, + { + "epoch": 0.042592, + "grad_norm": 1.25, + "learning_rate": 9.651451612903226e-05, + "loss": 0.1662, + "step": 2662 + }, + { + "epoch": 0.042608, + "grad_norm": 1.7265625, + "learning_rate": 9.651290322580646e-05, + "loss": 0.1458, + "step": 2663 + }, + { + "epoch": 0.042624, + "grad_norm": 1.484375, + "learning_rate": 9.651129032258064e-05, + "loss": 0.204, + "step": 2664 + }, + { + "epoch": 0.04264, + "grad_norm": 1.0390625, + "learning_rate": 9.650967741935484e-05, + "loss": 0.1933, + "step": 2665 + }, + { + "epoch": 0.042656, + "grad_norm": 0.92578125, + "learning_rate": 9.650806451612904e-05, + "loss": 0.1992, + "step": 2666 + }, + { + "epoch": 0.042672, + "grad_norm": 0.9375, + "learning_rate": 9.650645161290323e-05, + "loss": 0.166, + "step": 2667 + }, + { + "epoch": 0.042688, + "grad_norm": 0.8828125, + "learning_rate": 9.650483870967743e-05, + "loss": 0.2205, + "step": 2668 + }, + { + "epoch": 0.042704, + "grad_norm": 0.74609375, + "learning_rate": 9.650322580645162e-05, + "loss": 0.1497, + "step": 2669 + }, + { + "epoch": 0.04272, + "grad_norm": 1.375, + "learning_rate": 9.650161290322581e-05, + "loss": 0.2422, + "step": 2670 + }, + { + "epoch": 0.042736, + "grad_norm": 0.93359375, + "learning_rate": 9.65e-05, + "loss": 0.2274, + "step": 2671 + }, + { + "epoch": 0.042752, + "grad_norm": 1.5546875, + "learning_rate": 9.64983870967742e-05, + "loss": 0.2236, + "step": 2672 + }, + { + "epoch": 0.042768, + "grad_norm": 1.0546875, + "learning_rate": 9.649677419354839e-05, + "loss": 0.1939, + "step": 2673 + }, + { + "epoch": 0.042784, + "grad_norm": 0.64453125, + "learning_rate": 9.649516129032259e-05, + "loss": 0.1745, + "step": 2674 + }, + { + "epoch": 0.0428, + "grad_norm": 0.8828125, + "learning_rate": 9.649354838709677e-05, + "loss": 0.192, + "step": 2675 + }, + { + "epoch": 0.042816, + "grad_norm": 0.78515625, + "learning_rate": 9.649193548387097e-05, + "loss": 0.1918, + "step": 2676 + }, + { + "epoch": 0.042832, + "grad_norm": 1.2265625, + "learning_rate": 9.649032258064517e-05, + "loss": 0.2028, + "step": 2677 + }, + { + "epoch": 0.042848, + "grad_norm": 0.81640625, + "learning_rate": 9.648870967741936e-05, + "loss": 0.1882, + "step": 2678 + }, + { + "epoch": 0.042864, + "grad_norm": 0.8125, + "learning_rate": 9.648709677419356e-05, + "loss": 0.1999, + "step": 2679 + }, + { + "epoch": 0.04288, + "grad_norm": 0.73046875, + "learning_rate": 9.648548387096774e-05, + "loss": 0.1486, + "step": 2680 + }, + { + "epoch": 0.042896, + "grad_norm": 0.80078125, + "learning_rate": 9.648387096774194e-05, + "loss": 0.1797, + "step": 2681 + }, + { + "epoch": 0.042912, + "grad_norm": 0.77734375, + "learning_rate": 9.648225806451613e-05, + "loss": 0.1863, + "step": 2682 + }, + { + "epoch": 0.042928, + "grad_norm": 0.75, + "learning_rate": 9.648064516129033e-05, + "loss": 0.1729, + "step": 2683 + }, + { + "epoch": 0.042944, + "grad_norm": 0.86328125, + "learning_rate": 9.647903225806451e-05, + "loss": 0.1929, + "step": 2684 + }, + { + "epoch": 0.04296, + "grad_norm": 0.91015625, + "learning_rate": 9.647741935483871e-05, + "loss": 0.2228, + "step": 2685 + }, + { + "epoch": 0.042976, + "grad_norm": 0.99609375, + "learning_rate": 9.64758064516129e-05, + "loss": 0.2175, + "step": 2686 + }, + { + "epoch": 0.042992, + "grad_norm": 0.73828125, + "learning_rate": 9.64741935483871e-05, + "loss": 0.1434, + "step": 2687 + }, + { + "epoch": 0.043008, + "grad_norm": 0.63671875, + "learning_rate": 9.647258064516129e-05, + "loss": 0.1329, + "step": 2688 + }, + { + "epoch": 0.043024, + "grad_norm": 1.0234375, + "learning_rate": 9.647096774193549e-05, + "loss": 0.1721, + "step": 2689 + }, + { + "epoch": 0.04304, + "grad_norm": 1.046875, + "learning_rate": 9.646935483870969e-05, + "loss": 0.1972, + "step": 2690 + }, + { + "epoch": 0.043056, + "grad_norm": 1.203125, + "learning_rate": 9.646774193548389e-05, + "loss": 0.245, + "step": 2691 + }, + { + "epoch": 0.043072, + "grad_norm": 1.328125, + "learning_rate": 9.646612903225807e-05, + "loss": 0.1788, + "step": 2692 + }, + { + "epoch": 0.043088, + "grad_norm": 1.1796875, + "learning_rate": 9.646451612903227e-05, + "loss": 0.19, + "step": 2693 + }, + { + "epoch": 0.043104, + "grad_norm": 1.34375, + "learning_rate": 9.646290322580646e-05, + "loss": 0.2039, + "step": 2694 + }, + { + "epoch": 0.04312, + "grad_norm": 1.375, + "learning_rate": 9.646129032258066e-05, + "loss": 0.1812, + "step": 2695 + }, + { + "epoch": 0.043136, + "grad_norm": 0.73046875, + "learning_rate": 9.645967741935484e-05, + "loss": 0.1534, + "step": 2696 + }, + { + "epoch": 0.043152, + "grad_norm": 0.89453125, + "learning_rate": 9.645806451612903e-05, + "loss": 0.1834, + "step": 2697 + }, + { + "epoch": 0.043168, + "grad_norm": 0.78515625, + "learning_rate": 9.645645161290323e-05, + "loss": 0.175, + "step": 2698 + }, + { + "epoch": 0.043184, + "grad_norm": 1.1953125, + "learning_rate": 9.645483870967741e-05, + "loss": 0.1824, + "step": 2699 + }, + { + "epoch": 0.0432, + "grad_norm": 1.40625, + "learning_rate": 9.645322580645161e-05, + "loss": 0.2144, + "step": 2700 + }, + { + "epoch": 0.043216, + "grad_norm": 1.078125, + "learning_rate": 9.645161290322581e-05, + "loss": 0.1851, + "step": 2701 + }, + { + "epoch": 0.043232, + "grad_norm": 1.53125, + "learning_rate": 9.645000000000001e-05, + "loss": 0.1815, + "step": 2702 + }, + { + "epoch": 0.043248, + "grad_norm": 1.0078125, + "learning_rate": 9.64483870967742e-05, + "loss": 0.1957, + "step": 2703 + }, + { + "epoch": 0.043264, + "grad_norm": 0.859375, + "learning_rate": 9.64467741935484e-05, + "loss": 0.2025, + "step": 2704 + }, + { + "epoch": 0.04328, + "grad_norm": 0.82421875, + "learning_rate": 9.644516129032259e-05, + "loss": 0.1851, + "step": 2705 + }, + { + "epoch": 0.043296, + "grad_norm": 1.140625, + "learning_rate": 9.644354838709678e-05, + "loss": 0.2297, + "step": 2706 + }, + { + "epoch": 0.043312, + "grad_norm": 1.140625, + "learning_rate": 9.644193548387097e-05, + "loss": 0.2034, + "step": 2707 + }, + { + "epoch": 0.043328, + "grad_norm": 0.75390625, + "learning_rate": 9.644032258064517e-05, + "loss": 0.1505, + "step": 2708 + }, + { + "epoch": 0.043344, + "grad_norm": 1.1171875, + "learning_rate": 9.643870967741936e-05, + "loss": 0.2084, + "step": 2709 + }, + { + "epoch": 0.04336, + "grad_norm": 0.6015625, + "learning_rate": 9.643709677419356e-05, + "loss": 0.1766, + "step": 2710 + }, + { + "epoch": 0.043376, + "grad_norm": 0.86328125, + "learning_rate": 9.643548387096774e-05, + "loss": 0.2174, + "step": 2711 + }, + { + "epoch": 0.043392, + "grad_norm": 0.9375, + "learning_rate": 9.643387096774193e-05, + "loss": 0.2005, + "step": 2712 + }, + { + "epoch": 0.043408, + "grad_norm": 0.87890625, + "learning_rate": 9.643225806451613e-05, + "loss": 0.2009, + "step": 2713 + }, + { + "epoch": 0.043424, + "grad_norm": 1.0625, + "learning_rate": 9.643064516129033e-05, + "loss": 0.1811, + "step": 2714 + }, + { + "epoch": 0.04344, + "grad_norm": 1.015625, + "learning_rate": 9.642903225806453e-05, + "loss": 0.2321, + "step": 2715 + }, + { + "epoch": 0.043456, + "grad_norm": 0.86328125, + "learning_rate": 9.642741935483871e-05, + "loss": 0.1562, + "step": 2716 + }, + { + "epoch": 0.043472, + "grad_norm": 0.71484375, + "learning_rate": 9.642580645161291e-05, + "loss": 0.1736, + "step": 2717 + }, + { + "epoch": 0.043488, + "grad_norm": 0.8671875, + "learning_rate": 9.64241935483871e-05, + "loss": 0.1945, + "step": 2718 + }, + { + "epoch": 0.043504, + "grad_norm": 0.91015625, + "learning_rate": 9.64225806451613e-05, + "loss": 0.159, + "step": 2719 + }, + { + "epoch": 0.04352, + "grad_norm": 0.69140625, + "learning_rate": 9.642096774193548e-05, + "loss": 0.166, + "step": 2720 + }, + { + "epoch": 0.043536, + "grad_norm": 0.64453125, + "learning_rate": 9.641935483870968e-05, + "loss": 0.1512, + "step": 2721 + }, + { + "epoch": 0.043552, + "grad_norm": 0.86328125, + "learning_rate": 9.641774193548387e-05, + "loss": 0.2057, + "step": 2722 + }, + { + "epoch": 0.043568, + "grad_norm": 0.99609375, + "learning_rate": 9.641612903225807e-05, + "loss": 0.1649, + "step": 2723 + }, + { + "epoch": 0.043584, + "grad_norm": 0.65625, + "learning_rate": 9.641451612903226e-05, + "loss": 0.1536, + "step": 2724 + }, + { + "epoch": 0.0436, + "grad_norm": 1.3359375, + "learning_rate": 9.641290322580646e-05, + "loss": 0.1732, + "step": 2725 + }, + { + "epoch": 0.043616, + "grad_norm": 1.15625, + "learning_rate": 9.641129032258066e-05, + "loss": 0.2138, + "step": 2726 + }, + { + "epoch": 0.043632, + "grad_norm": 0.78515625, + "learning_rate": 9.640967741935484e-05, + "loss": 0.1607, + "step": 2727 + }, + { + "epoch": 0.043648, + "grad_norm": 0.87109375, + "learning_rate": 9.640806451612904e-05, + "loss": 0.1575, + "step": 2728 + }, + { + "epoch": 0.043664, + "grad_norm": 1.28125, + "learning_rate": 9.640645161290323e-05, + "loss": 0.2036, + "step": 2729 + }, + { + "epoch": 0.04368, + "grad_norm": 1.0234375, + "learning_rate": 9.640483870967743e-05, + "loss": 0.1901, + "step": 2730 + }, + { + "epoch": 0.043696, + "grad_norm": 1.2421875, + "learning_rate": 9.640322580645161e-05, + "loss": 0.2083, + "step": 2731 + }, + { + "epoch": 0.043712, + "grad_norm": 0.7890625, + "learning_rate": 9.640161290322581e-05, + "loss": 0.1826, + "step": 2732 + }, + { + "epoch": 0.043728, + "grad_norm": 0.80859375, + "learning_rate": 9.64e-05, + "loss": 0.1781, + "step": 2733 + }, + { + "epoch": 0.043744, + "grad_norm": 0.84375, + "learning_rate": 9.63983870967742e-05, + "loss": 0.1781, + "step": 2734 + }, + { + "epoch": 0.04376, + "grad_norm": 1.0390625, + "learning_rate": 9.639677419354838e-05, + "loss": 0.2002, + "step": 2735 + }, + { + "epoch": 0.043776, + "grad_norm": 1.5546875, + "learning_rate": 9.639516129032258e-05, + "loss": 0.1988, + "step": 2736 + }, + { + "epoch": 0.043792, + "grad_norm": 0.6484375, + "learning_rate": 9.639354838709678e-05, + "loss": 0.1748, + "step": 2737 + }, + { + "epoch": 0.043808, + "grad_norm": 0.734375, + "learning_rate": 9.639193548387098e-05, + "loss": 0.1687, + "step": 2738 + }, + { + "epoch": 0.043824, + "grad_norm": 0.71875, + "learning_rate": 9.639032258064517e-05, + "loss": 0.1845, + "step": 2739 + }, + { + "epoch": 0.04384, + "grad_norm": 0.94921875, + "learning_rate": 9.638870967741937e-05, + "loss": 0.1883, + "step": 2740 + }, + { + "epoch": 0.043856, + "grad_norm": 0.90625, + "learning_rate": 9.638709677419355e-05, + "loss": 0.1876, + "step": 2741 + }, + { + "epoch": 0.043872, + "grad_norm": 0.74609375, + "learning_rate": 9.638548387096774e-05, + "loss": 0.1942, + "step": 2742 + }, + { + "epoch": 0.043888, + "grad_norm": 0.75390625, + "learning_rate": 9.638387096774194e-05, + "loss": 0.1823, + "step": 2743 + }, + { + "epoch": 0.043904, + "grad_norm": 0.96875, + "learning_rate": 9.638225806451613e-05, + "loss": 0.2198, + "step": 2744 + }, + { + "epoch": 0.04392, + "grad_norm": 1.1328125, + "learning_rate": 9.638064516129033e-05, + "loss": 0.2471, + "step": 2745 + }, + { + "epoch": 0.043936, + "grad_norm": 1.0, + "learning_rate": 9.637903225806451e-05, + "loss": 0.1586, + "step": 2746 + }, + { + "epoch": 0.043952, + "grad_norm": 1.1015625, + "learning_rate": 9.637741935483871e-05, + "loss": 0.2143, + "step": 2747 + }, + { + "epoch": 0.043968, + "grad_norm": 0.81640625, + "learning_rate": 9.63758064516129e-05, + "loss": 0.1681, + "step": 2748 + }, + { + "epoch": 0.043984, + "grad_norm": 1.15625, + "learning_rate": 9.63741935483871e-05, + "loss": 0.1942, + "step": 2749 + }, + { + "epoch": 0.044, + "grad_norm": 0.90234375, + "learning_rate": 9.63725806451613e-05, + "loss": 0.158, + "step": 2750 + }, + { + "epoch": 0.044016, + "grad_norm": 0.78515625, + "learning_rate": 9.63709677419355e-05, + "loss": 0.1551, + "step": 2751 + }, + { + "epoch": 0.044032, + "grad_norm": 1.3515625, + "learning_rate": 9.636935483870968e-05, + "loss": 0.19, + "step": 2752 + }, + { + "epoch": 0.044048, + "grad_norm": 0.94921875, + "learning_rate": 9.636774193548388e-05, + "loss": 0.2148, + "step": 2753 + }, + { + "epoch": 0.044064, + "grad_norm": 0.87109375, + "learning_rate": 9.636612903225807e-05, + "loss": 0.1869, + "step": 2754 + }, + { + "epoch": 0.04408, + "grad_norm": 1.21875, + "learning_rate": 9.636451612903227e-05, + "loss": 0.2126, + "step": 2755 + }, + { + "epoch": 0.044096, + "grad_norm": 1.140625, + "learning_rate": 9.636290322580645e-05, + "loss": 0.1924, + "step": 2756 + }, + { + "epoch": 0.044112, + "grad_norm": 0.88671875, + "learning_rate": 9.636129032258065e-05, + "loss": 0.2033, + "step": 2757 + }, + { + "epoch": 0.044128, + "grad_norm": 1.171875, + "learning_rate": 9.635967741935484e-05, + "loss": 0.2055, + "step": 2758 + }, + { + "epoch": 0.044144, + "grad_norm": 0.99609375, + "learning_rate": 9.635806451612903e-05, + "loss": 0.225, + "step": 2759 + }, + { + "epoch": 0.04416, + "grad_norm": 0.69140625, + "learning_rate": 9.635645161290323e-05, + "loss": 0.1832, + "step": 2760 + }, + { + "epoch": 0.044176, + "grad_norm": 0.82421875, + "learning_rate": 9.635483870967743e-05, + "loss": 0.1857, + "step": 2761 + }, + { + "epoch": 0.044192, + "grad_norm": 0.95703125, + "learning_rate": 9.635322580645163e-05, + "loss": 0.2198, + "step": 2762 + }, + { + "epoch": 0.044208, + "grad_norm": 0.76953125, + "learning_rate": 9.635161290322581e-05, + "loss": 0.1865, + "step": 2763 + }, + { + "epoch": 0.044224, + "grad_norm": 1.0546875, + "learning_rate": 9.635000000000001e-05, + "loss": 0.2134, + "step": 2764 + }, + { + "epoch": 0.04424, + "grad_norm": 1.1015625, + "learning_rate": 9.63483870967742e-05, + "loss": 0.1679, + "step": 2765 + }, + { + "epoch": 0.044256, + "grad_norm": 1.046875, + "learning_rate": 9.63467741935484e-05, + "loss": 0.2013, + "step": 2766 + }, + { + "epoch": 0.044272, + "grad_norm": 0.71484375, + "learning_rate": 9.634516129032258e-05, + "loss": 0.1631, + "step": 2767 + }, + { + "epoch": 0.044288, + "grad_norm": 1.0859375, + "learning_rate": 9.634354838709678e-05, + "loss": 0.1637, + "step": 2768 + }, + { + "epoch": 0.044304, + "grad_norm": 0.9921875, + "learning_rate": 9.634193548387097e-05, + "loss": 0.1861, + "step": 2769 + }, + { + "epoch": 0.04432, + "grad_norm": 0.81640625, + "learning_rate": 9.634032258064517e-05, + "loss": 0.2075, + "step": 2770 + }, + { + "epoch": 0.044336, + "grad_norm": 0.66015625, + "learning_rate": 9.633870967741935e-05, + "loss": 0.1533, + "step": 2771 + }, + { + "epoch": 0.044352, + "grad_norm": 1.2421875, + "learning_rate": 9.633709677419355e-05, + "loss": 0.1808, + "step": 2772 + }, + { + "epoch": 0.044368, + "grad_norm": 0.73828125, + "learning_rate": 9.633548387096774e-05, + "loss": 0.2111, + "step": 2773 + }, + { + "epoch": 0.044384, + "grad_norm": 1.046875, + "learning_rate": 9.633387096774194e-05, + "loss": 0.1627, + "step": 2774 + }, + { + "epoch": 0.0444, + "grad_norm": 0.83203125, + "learning_rate": 9.633225806451614e-05, + "loss": 0.1838, + "step": 2775 + }, + { + "epoch": 0.044416, + "grad_norm": 1.0859375, + "learning_rate": 9.633064516129033e-05, + "loss": 0.1638, + "step": 2776 + }, + { + "epoch": 0.044432, + "grad_norm": 0.66796875, + "learning_rate": 9.632903225806452e-05, + "loss": 0.1725, + "step": 2777 + }, + { + "epoch": 0.044448, + "grad_norm": 0.8828125, + "learning_rate": 9.632741935483871e-05, + "loss": 0.1936, + "step": 2778 + }, + { + "epoch": 0.044464, + "grad_norm": 1.0078125, + "learning_rate": 9.632580645161291e-05, + "loss": 0.222, + "step": 2779 + }, + { + "epoch": 0.04448, + "grad_norm": 0.69140625, + "learning_rate": 9.63241935483871e-05, + "loss": 0.1685, + "step": 2780 + }, + { + "epoch": 0.044496, + "grad_norm": 1.3046875, + "learning_rate": 9.63225806451613e-05, + "loss": 0.1548, + "step": 2781 + }, + { + "epoch": 0.044512, + "grad_norm": 0.8828125, + "learning_rate": 9.632096774193548e-05, + "loss": 0.192, + "step": 2782 + }, + { + "epoch": 0.044528, + "grad_norm": 0.76171875, + "learning_rate": 9.631935483870968e-05, + "loss": 0.1925, + "step": 2783 + }, + { + "epoch": 0.044544, + "grad_norm": 1.65625, + "learning_rate": 9.631774193548387e-05, + "loss": 0.2078, + "step": 2784 + }, + { + "epoch": 0.04456, + "grad_norm": 1.8203125, + "learning_rate": 9.631612903225807e-05, + "loss": 0.2385, + "step": 2785 + }, + { + "epoch": 0.044576, + "grad_norm": 0.92578125, + "learning_rate": 9.631451612903227e-05, + "loss": 0.2228, + "step": 2786 + }, + { + "epoch": 0.044592, + "grad_norm": 0.96484375, + "learning_rate": 9.631290322580647e-05, + "loss": 0.1876, + "step": 2787 + }, + { + "epoch": 0.044608, + "grad_norm": 0.56640625, + "learning_rate": 9.631129032258065e-05, + "loss": 0.1462, + "step": 2788 + }, + { + "epoch": 0.044624, + "grad_norm": 0.6171875, + "learning_rate": 9.630967741935484e-05, + "loss": 0.174, + "step": 2789 + }, + { + "epoch": 0.04464, + "grad_norm": 0.8046875, + "learning_rate": 9.630806451612904e-05, + "loss": 0.1552, + "step": 2790 + }, + { + "epoch": 0.044656, + "grad_norm": 1.3828125, + "learning_rate": 9.630645161290322e-05, + "loss": 0.1688, + "step": 2791 + }, + { + "epoch": 0.044672, + "grad_norm": 0.76171875, + "learning_rate": 9.630483870967742e-05, + "loss": 0.1643, + "step": 2792 + }, + { + "epoch": 0.044688, + "grad_norm": 0.90234375, + "learning_rate": 9.630322580645161e-05, + "loss": 0.1731, + "step": 2793 + }, + { + "epoch": 0.044704, + "grad_norm": 0.98828125, + "learning_rate": 9.630161290322581e-05, + "loss": 0.1852, + "step": 2794 + }, + { + "epoch": 0.04472, + "grad_norm": 1.421875, + "learning_rate": 9.63e-05, + "loss": 0.2295, + "step": 2795 + }, + { + "epoch": 0.044736, + "grad_norm": 1.078125, + "learning_rate": 9.62983870967742e-05, + "loss": 0.2318, + "step": 2796 + }, + { + "epoch": 0.044752, + "grad_norm": 0.6171875, + "learning_rate": 9.62967741935484e-05, + "loss": 0.1488, + "step": 2797 + }, + { + "epoch": 0.044768, + "grad_norm": 1.140625, + "learning_rate": 9.62951612903226e-05, + "loss": 0.2252, + "step": 2798 + }, + { + "epoch": 0.044784, + "grad_norm": 1.5390625, + "learning_rate": 9.629354838709678e-05, + "loss": 0.1784, + "step": 2799 + }, + { + "epoch": 0.0448, + "grad_norm": 0.796875, + "learning_rate": 9.629193548387098e-05, + "loss": 0.1979, + "step": 2800 + }, + { + "epoch": 0.044816, + "grad_norm": 0.99609375, + "learning_rate": 9.629032258064517e-05, + "loss": 0.1893, + "step": 2801 + }, + { + "epoch": 0.044832, + "grad_norm": 0.70703125, + "learning_rate": 9.628870967741937e-05, + "loss": 0.161, + "step": 2802 + }, + { + "epoch": 0.044848, + "grad_norm": 0.95703125, + "learning_rate": 9.628709677419355e-05, + "loss": 0.209, + "step": 2803 + }, + { + "epoch": 0.044864, + "grad_norm": 0.8203125, + "learning_rate": 9.628548387096775e-05, + "loss": 0.1468, + "step": 2804 + }, + { + "epoch": 0.04488, + "grad_norm": 1.15625, + "learning_rate": 9.628387096774194e-05, + "loss": 0.1863, + "step": 2805 + }, + { + "epoch": 0.044896, + "grad_norm": 0.80859375, + "learning_rate": 9.628225806451612e-05, + "loss": 0.1511, + "step": 2806 + }, + { + "epoch": 0.044912, + "grad_norm": 1.3671875, + "learning_rate": 9.628064516129032e-05, + "loss": 0.2397, + "step": 2807 + }, + { + "epoch": 0.044928, + "grad_norm": 0.828125, + "learning_rate": 9.627903225806451e-05, + "loss": 0.2429, + "step": 2808 + }, + { + "epoch": 0.044944, + "grad_norm": 0.796875, + "learning_rate": 9.627741935483871e-05, + "loss": 0.1874, + "step": 2809 + }, + { + "epoch": 0.04496, + "grad_norm": 0.7890625, + "learning_rate": 9.627580645161291e-05, + "loss": 0.1989, + "step": 2810 + }, + { + "epoch": 0.044976, + "grad_norm": 1.1953125, + "learning_rate": 9.627419354838711e-05, + "loss": 0.1882, + "step": 2811 + }, + { + "epoch": 0.044992, + "grad_norm": 0.78515625, + "learning_rate": 9.62725806451613e-05, + "loss": 0.2134, + "step": 2812 + }, + { + "epoch": 0.045008, + "grad_norm": 1.09375, + "learning_rate": 9.62709677419355e-05, + "loss": 0.2065, + "step": 2813 + }, + { + "epoch": 0.045024, + "grad_norm": 0.875, + "learning_rate": 9.626935483870968e-05, + "loss": 0.2148, + "step": 2814 + }, + { + "epoch": 0.04504, + "grad_norm": 0.796875, + "learning_rate": 9.626774193548388e-05, + "loss": 0.2036, + "step": 2815 + }, + { + "epoch": 0.045056, + "grad_norm": 0.95703125, + "learning_rate": 9.626612903225807e-05, + "loss": 0.2185, + "step": 2816 + }, + { + "epoch": 0.045072, + "grad_norm": 1.1875, + "learning_rate": 9.626451612903227e-05, + "loss": 0.1848, + "step": 2817 + }, + { + "epoch": 0.045088, + "grad_norm": 0.91796875, + "learning_rate": 9.626290322580645e-05, + "loss": 0.178, + "step": 2818 + }, + { + "epoch": 0.045104, + "grad_norm": 0.88671875, + "learning_rate": 9.626129032258065e-05, + "loss": 0.1653, + "step": 2819 + }, + { + "epoch": 0.04512, + "grad_norm": 0.82421875, + "learning_rate": 9.625967741935484e-05, + "loss": 0.1957, + "step": 2820 + }, + { + "epoch": 0.045136, + "grad_norm": 1.25, + "learning_rate": 9.625806451612904e-05, + "loss": 0.2214, + "step": 2821 + }, + { + "epoch": 0.045152, + "grad_norm": 0.66015625, + "learning_rate": 9.625645161290324e-05, + "loss": 0.1628, + "step": 2822 + }, + { + "epoch": 0.045168, + "grad_norm": 0.84765625, + "learning_rate": 9.625483870967742e-05, + "loss": 0.1949, + "step": 2823 + }, + { + "epoch": 0.045184, + "grad_norm": 1.1171875, + "learning_rate": 9.625322580645162e-05, + "loss": 0.2171, + "step": 2824 + }, + { + "epoch": 0.0452, + "grad_norm": 1.3203125, + "learning_rate": 9.625161290322581e-05, + "loss": 0.1751, + "step": 2825 + }, + { + "epoch": 0.045216, + "grad_norm": 1.4453125, + "learning_rate": 9.625000000000001e-05, + "loss": 0.2106, + "step": 2826 + }, + { + "epoch": 0.045232, + "grad_norm": 1.03125, + "learning_rate": 9.62483870967742e-05, + "loss": 0.1554, + "step": 2827 + }, + { + "epoch": 0.045248, + "grad_norm": 0.6640625, + "learning_rate": 9.62467741935484e-05, + "loss": 0.1926, + "step": 2828 + }, + { + "epoch": 0.045264, + "grad_norm": 1.2578125, + "learning_rate": 9.624516129032258e-05, + "loss": 0.1585, + "step": 2829 + }, + { + "epoch": 0.04528, + "grad_norm": 0.55859375, + "learning_rate": 9.624354838709678e-05, + "loss": 0.1643, + "step": 2830 + }, + { + "epoch": 0.045296, + "grad_norm": 1.4140625, + "learning_rate": 9.624193548387097e-05, + "loss": 0.2375, + "step": 2831 + }, + { + "epoch": 0.045312, + "grad_norm": 1.734375, + "learning_rate": 9.624032258064517e-05, + "loss": 0.2085, + "step": 2832 + }, + { + "epoch": 0.045328, + "grad_norm": 1.03125, + "learning_rate": 9.623870967741937e-05, + "loss": 0.2147, + "step": 2833 + }, + { + "epoch": 0.045344, + "grad_norm": 1.3125, + "learning_rate": 9.623709677419355e-05, + "loss": 0.1974, + "step": 2834 + }, + { + "epoch": 0.04536, + "grad_norm": 0.70703125, + "learning_rate": 9.623548387096775e-05, + "loss": 0.172, + "step": 2835 + }, + { + "epoch": 0.045376, + "grad_norm": 0.79296875, + "learning_rate": 9.623387096774194e-05, + "loss": 0.1548, + "step": 2836 + }, + { + "epoch": 0.045392, + "grad_norm": 0.91796875, + "learning_rate": 9.623225806451614e-05, + "loss": 0.1594, + "step": 2837 + }, + { + "epoch": 0.045408, + "grad_norm": 0.7890625, + "learning_rate": 9.623064516129032e-05, + "loss": 0.173, + "step": 2838 + }, + { + "epoch": 0.045424, + "grad_norm": 1.1796875, + "learning_rate": 9.622903225806452e-05, + "loss": 0.2315, + "step": 2839 + }, + { + "epoch": 0.04544, + "grad_norm": 1.6640625, + "learning_rate": 9.622741935483871e-05, + "loss": 0.241, + "step": 2840 + }, + { + "epoch": 0.045456, + "grad_norm": 0.77734375, + "learning_rate": 9.622580645161291e-05, + "loss": 0.2244, + "step": 2841 + }, + { + "epoch": 0.045472, + "grad_norm": 1.2421875, + "learning_rate": 9.62241935483871e-05, + "loss": 0.186, + "step": 2842 + }, + { + "epoch": 0.045488, + "grad_norm": 0.87109375, + "learning_rate": 9.62225806451613e-05, + "loss": 0.1683, + "step": 2843 + }, + { + "epoch": 0.045504, + "grad_norm": 0.7734375, + "learning_rate": 9.622096774193548e-05, + "loss": 0.1847, + "step": 2844 + }, + { + "epoch": 0.04552, + "grad_norm": 0.94921875, + "learning_rate": 9.621935483870968e-05, + "loss": 0.1943, + "step": 2845 + }, + { + "epoch": 0.045536, + "grad_norm": 1.015625, + "learning_rate": 9.621774193548388e-05, + "loss": 0.2383, + "step": 2846 + }, + { + "epoch": 0.045552, + "grad_norm": 1.65625, + "learning_rate": 9.621612903225808e-05, + "loss": 0.2089, + "step": 2847 + }, + { + "epoch": 0.045568, + "grad_norm": 1.5625, + "learning_rate": 9.621451612903226e-05, + "loss": 0.1572, + "step": 2848 + }, + { + "epoch": 0.045584, + "grad_norm": 1.3125, + "learning_rate": 9.621290322580646e-05, + "loss": 0.1861, + "step": 2849 + }, + { + "epoch": 0.0456, + "grad_norm": 0.9296875, + "learning_rate": 9.621129032258065e-05, + "loss": 0.1573, + "step": 2850 + }, + { + "epoch": 0.045616, + "grad_norm": 1.4375, + "learning_rate": 9.620967741935484e-05, + "loss": 0.1663, + "step": 2851 + }, + { + "epoch": 0.045632, + "grad_norm": 0.83203125, + "learning_rate": 9.620806451612904e-05, + "loss": 0.202, + "step": 2852 + }, + { + "epoch": 0.045648, + "grad_norm": 0.796875, + "learning_rate": 9.620645161290322e-05, + "loss": 0.2117, + "step": 2853 + }, + { + "epoch": 0.045664, + "grad_norm": 1.5546875, + "learning_rate": 9.620483870967742e-05, + "loss": 0.1847, + "step": 2854 + }, + { + "epoch": 0.04568, + "grad_norm": 1.390625, + "learning_rate": 9.620322580645161e-05, + "loss": 0.1904, + "step": 2855 + }, + { + "epoch": 0.045696, + "grad_norm": 1.421875, + "learning_rate": 9.620161290322581e-05, + "loss": 0.2164, + "step": 2856 + }, + { + "epoch": 0.045712, + "grad_norm": 1.2109375, + "learning_rate": 9.620000000000001e-05, + "loss": 0.1859, + "step": 2857 + }, + { + "epoch": 0.045728, + "grad_norm": 1.21875, + "learning_rate": 9.619838709677421e-05, + "loss": 0.1924, + "step": 2858 + }, + { + "epoch": 0.045744, + "grad_norm": 0.91796875, + "learning_rate": 9.619677419354839e-05, + "loss": 0.1795, + "step": 2859 + }, + { + "epoch": 0.04576, + "grad_norm": 0.76953125, + "learning_rate": 9.619516129032259e-05, + "loss": 0.1628, + "step": 2860 + }, + { + "epoch": 0.045776, + "grad_norm": 0.984375, + "learning_rate": 9.619354838709678e-05, + "loss": 0.1546, + "step": 2861 + }, + { + "epoch": 0.045792, + "grad_norm": 0.77734375, + "learning_rate": 9.619193548387098e-05, + "loss": 0.1848, + "step": 2862 + }, + { + "epoch": 0.045808, + "grad_norm": 1.046875, + "learning_rate": 9.619032258064516e-05, + "loss": 0.1502, + "step": 2863 + }, + { + "epoch": 0.045824, + "grad_norm": 1.0234375, + "learning_rate": 9.618870967741936e-05, + "loss": 0.2126, + "step": 2864 + }, + { + "epoch": 0.04584, + "grad_norm": 1.2109375, + "learning_rate": 9.618709677419355e-05, + "loss": 0.1834, + "step": 2865 + }, + { + "epoch": 0.045856, + "grad_norm": 0.76953125, + "learning_rate": 9.618548387096775e-05, + "loss": 0.1902, + "step": 2866 + }, + { + "epoch": 0.045872, + "grad_norm": 0.83203125, + "learning_rate": 9.618387096774194e-05, + "loss": 0.1656, + "step": 2867 + }, + { + "epoch": 0.045888, + "grad_norm": 1.296875, + "learning_rate": 9.618225806451612e-05, + "loss": 0.174, + "step": 2868 + }, + { + "epoch": 0.045904, + "grad_norm": 1.46875, + "learning_rate": 9.618064516129032e-05, + "loss": 0.2344, + "step": 2869 + }, + { + "epoch": 0.04592, + "grad_norm": 1.5234375, + "learning_rate": 9.617903225806452e-05, + "loss": 0.2155, + "step": 2870 + }, + { + "epoch": 0.045936, + "grad_norm": 0.88671875, + "learning_rate": 9.617741935483872e-05, + "loss": 0.1799, + "step": 2871 + }, + { + "epoch": 0.045952, + "grad_norm": 0.70703125, + "learning_rate": 9.617580645161291e-05, + "loss": 0.1945, + "step": 2872 + }, + { + "epoch": 0.045968, + "grad_norm": 0.8046875, + "learning_rate": 9.61741935483871e-05, + "loss": 0.2181, + "step": 2873 + }, + { + "epoch": 0.045984, + "grad_norm": 0.80859375, + "learning_rate": 9.617258064516129e-05, + "loss": 0.2, + "step": 2874 + }, + { + "epoch": 0.046, + "grad_norm": 1.21875, + "learning_rate": 9.617096774193549e-05, + "loss": 0.2063, + "step": 2875 + }, + { + "epoch": 0.046016, + "grad_norm": 0.640625, + "learning_rate": 9.616935483870968e-05, + "loss": 0.1826, + "step": 2876 + }, + { + "epoch": 0.046032, + "grad_norm": 0.66796875, + "learning_rate": 9.616774193548388e-05, + "loss": 0.1379, + "step": 2877 + }, + { + "epoch": 0.046048, + "grad_norm": 0.6875, + "learning_rate": 9.616612903225806e-05, + "loss": 0.1878, + "step": 2878 + }, + { + "epoch": 0.046064, + "grad_norm": 0.91796875, + "learning_rate": 9.616451612903226e-05, + "loss": 0.2311, + "step": 2879 + }, + { + "epoch": 0.04608, + "grad_norm": 0.7265625, + "learning_rate": 9.616290322580645e-05, + "loss": 0.1785, + "step": 2880 + }, + { + "epoch": 0.046096, + "grad_norm": 0.734375, + "learning_rate": 9.616129032258065e-05, + "loss": 0.1683, + "step": 2881 + }, + { + "epoch": 0.046112, + "grad_norm": 0.828125, + "learning_rate": 9.615967741935485e-05, + "loss": 0.1851, + "step": 2882 + }, + { + "epoch": 0.046128, + "grad_norm": 0.81640625, + "learning_rate": 9.615806451612904e-05, + "loss": 0.2014, + "step": 2883 + }, + { + "epoch": 0.046144, + "grad_norm": 1.0, + "learning_rate": 9.615645161290323e-05, + "loss": 0.2074, + "step": 2884 + }, + { + "epoch": 0.04616, + "grad_norm": 1.203125, + "learning_rate": 9.615483870967742e-05, + "loss": 0.2117, + "step": 2885 + }, + { + "epoch": 0.046176, + "grad_norm": 0.796875, + "learning_rate": 9.615322580645162e-05, + "loss": 0.1856, + "step": 2886 + }, + { + "epoch": 0.046192, + "grad_norm": 1.0859375, + "learning_rate": 9.61516129032258e-05, + "loss": 0.2167, + "step": 2887 + }, + { + "epoch": 0.046208, + "grad_norm": 0.98828125, + "learning_rate": 9.615e-05, + "loss": 0.2537, + "step": 2888 + }, + { + "epoch": 0.046224, + "grad_norm": 0.9609375, + "learning_rate": 9.614838709677419e-05, + "loss": 0.223, + "step": 2889 + }, + { + "epoch": 0.04624, + "grad_norm": 0.81640625, + "learning_rate": 9.614677419354839e-05, + "loss": 0.1762, + "step": 2890 + }, + { + "epoch": 0.046256, + "grad_norm": 0.984375, + "learning_rate": 9.614516129032258e-05, + "loss": 0.1483, + "step": 2891 + }, + { + "epoch": 0.046272, + "grad_norm": 1.1171875, + "learning_rate": 9.614354838709678e-05, + "loss": 0.2237, + "step": 2892 + }, + { + "epoch": 0.046288, + "grad_norm": 0.83203125, + "learning_rate": 9.614193548387098e-05, + "loss": 0.1822, + "step": 2893 + }, + { + "epoch": 0.046304, + "grad_norm": 0.81640625, + "learning_rate": 9.614032258064518e-05, + "loss": 0.1703, + "step": 2894 + }, + { + "epoch": 0.04632, + "grad_norm": 0.69921875, + "learning_rate": 9.613870967741936e-05, + "loss": 0.1997, + "step": 2895 + }, + { + "epoch": 0.046336, + "grad_norm": 0.71484375, + "learning_rate": 9.613709677419356e-05, + "loss": 0.1636, + "step": 2896 + }, + { + "epoch": 0.046352, + "grad_norm": 1.015625, + "learning_rate": 9.613548387096775e-05, + "loss": 0.2189, + "step": 2897 + }, + { + "epoch": 0.046368, + "grad_norm": 0.61328125, + "learning_rate": 9.613387096774193e-05, + "loss": 0.1641, + "step": 2898 + }, + { + "epoch": 0.046384, + "grad_norm": 0.66015625, + "learning_rate": 9.613225806451613e-05, + "loss": 0.1835, + "step": 2899 + }, + { + "epoch": 0.0464, + "grad_norm": 0.98828125, + "learning_rate": 9.613064516129032e-05, + "loss": 0.1699, + "step": 2900 + }, + { + "epoch": 0.046416, + "grad_norm": 0.7578125, + "learning_rate": 9.612903225806452e-05, + "loss": 0.1724, + "step": 2901 + }, + { + "epoch": 0.046432, + "grad_norm": 0.8828125, + "learning_rate": 9.61274193548387e-05, + "loss": 0.1885, + "step": 2902 + }, + { + "epoch": 0.046448, + "grad_norm": 0.8359375, + "learning_rate": 9.61258064516129e-05, + "loss": 0.165, + "step": 2903 + }, + { + "epoch": 0.046464, + "grad_norm": 0.88671875, + "learning_rate": 9.612419354838709e-05, + "loss": 0.165, + "step": 2904 + }, + { + "epoch": 0.04648, + "grad_norm": 0.90625, + "learning_rate": 9.612258064516129e-05, + "loss": 0.168, + "step": 2905 + }, + { + "epoch": 0.046496, + "grad_norm": 0.82421875, + "learning_rate": 9.612096774193549e-05, + "loss": 0.2191, + "step": 2906 + }, + { + "epoch": 0.046512, + "grad_norm": 1.1484375, + "learning_rate": 9.611935483870969e-05, + "loss": 0.2549, + "step": 2907 + }, + { + "epoch": 0.046528, + "grad_norm": 0.78515625, + "learning_rate": 9.611774193548388e-05, + "loss": 0.2054, + "step": 2908 + }, + { + "epoch": 0.046544, + "grad_norm": 0.96875, + "learning_rate": 9.611612903225808e-05, + "loss": 0.1818, + "step": 2909 + }, + { + "epoch": 0.04656, + "grad_norm": 0.9296875, + "learning_rate": 9.611451612903226e-05, + "loss": 0.1662, + "step": 2910 + }, + { + "epoch": 0.046576, + "grad_norm": 0.7421875, + "learning_rate": 9.611290322580646e-05, + "loss": 0.1646, + "step": 2911 + }, + { + "epoch": 0.046592, + "grad_norm": 0.859375, + "learning_rate": 9.611129032258065e-05, + "loss": 0.1449, + "step": 2912 + }, + { + "epoch": 0.046608, + "grad_norm": 1.046875, + "learning_rate": 9.610967741935485e-05, + "loss": 0.2027, + "step": 2913 + }, + { + "epoch": 0.046624, + "grad_norm": 0.7265625, + "learning_rate": 9.610806451612903e-05, + "loss": 0.1541, + "step": 2914 + }, + { + "epoch": 0.04664, + "grad_norm": 1.2109375, + "learning_rate": 9.610645161290322e-05, + "loss": 0.2178, + "step": 2915 + }, + { + "epoch": 0.046656, + "grad_norm": 0.8203125, + "learning_rate": 9.610483870967742e-05, + "loss": 0.1567, + "step": 2916 + }, + { + "epoch": 0.046672, + "grad_norm": 1.0859375, + "learning_rate": 9.610322580645162e-05, + "loss": 0.215, + "step": 2917 + }, + { + "epoch": 0.046688, + "grad_norm": 1.1015625, + "learning_rate": 9.610161290322582e-05, + "loss": 0.1889, + "step": 2918 + }, + { + "epoch": 0.046704, + "grad_norm": 0.79296875, + "learning_rate": 9.61e-05, + "loss": 0.2138, + "step": 2919 + }, + { + "epoch": 0.04672, + "grad_norm": 0.8203125, + "learning_rate": 9.60983870967742e-05, + "loss": 0.1688, + "step": 2920 + }, + { + "epoch": 0.046736, + "grad_norm": 0.83984375, + "learning_rate": 9.609677419354839e-05, + "loss": 0.1904, + "step": 2921 + }, + { + "epoch": 0.046752, + "grad_norm": 0.78515625, + "learning_rate": 9.609516129032259e-05, + "loss": 0.1886, + "step": 2922 + }, + { + "epoch": 0.046768, + "grad_norm": 1.171875, + "learning_rate": 9.609354838709678e-05, + "loss": 0.2174, + "step": 2923 + }, + { + "epoch": 0.046784, + "grad_norm": 0.8984375, + "learning_rate": 9.609193548387098e-05, + "loss": 0.2325, + "step": 2924 + }, + { + "epoch": 0.0468, + "grad_norm": 0.80078125, + "learning_rate": 9.609032258064516e-05, + "loss": 0.1685, + "step": 2925 + }, + { + "epoch": 0.046816, + "grad_norm": 0.78125, + "learning_rate": 9.608870967741936e-05, + "loss": 0.1733, + "step": 2926 + }, + { + "epoch": 0.046832, + "grad_norm": 0.77734375, + "learning_rate": 9.608709677419355e-05, + "loss": 0.1817, + "step": 2927 + }, + { + "epoch": 0.046848, + "grad_norm": 1.0859375, + "learning_rate": 9.608548387096775e-05, + "loss": 0.2003, + "step": 2928 + }, + { + "epoch": 0.046864, + "grad_norm": 0.69140625, + "learning_rate": 9.608387096774195e-05, + "loss": 0.1585, + "step": 2929 + }, + { + "epoch": 0.04688, + "grad_norm": 0.89453125, + "learning_rate": 9.608225806451613e-05, + "loss": 0.2096, + "step": 2930 + }, + { + "epoch": 0.046896, + "grad_norm": 1.046875, + "learning_rate": 9.608064516129033e-05, + "loss": 0.1856, + "step": 2931 + }, + { + "epoch": 0.046912, + "grad_norm": 1.59375, + "learning_rate": 9.607903225806452e-05, + "loss": 0.2367, + "step": 2932 + }, + { + "epoch": 0.046928, + "grad_norm": 0.8515625, + "learning_rate": 9.607741935483872e-05, + "loss": 0.1846, + "step": 2933 + }, + { + "epoch": 0.046944, + "grad_norm": 0.61328125, + "learning_rate": 9.60758064516129e-05, + "loss": 0.1528, + "step": 2934 + }, + { + "epoch": 0.04696, + "grad_norm": 0.72265625, + "learning_rate": 9.60741935483871e-05, + "loss": 0.1926, + "step": 2935 + }, + { + "epoch": 0.046976, + "grad_norm": 0.8046875, + "learning_rate": 9.607258064516129e-05, + "loss": 0.2191, + "step": 2936 + }, + { + "epoch": 0.046992, + "grad_norm": 0.6875, + "learning_rate": 9.607096774193549e-05, + "loss": 0.1742, + "step": 2937 + }, + { + "epoch": 0.047008, + "grad_norm": 0.765625, + "learning_rate": 9.606935483870968e-05, + "loss": 0.16, + "step": 2938 + }, + { + "epoch": 0.047024, + "grad_norm": 0.7578125, + "learning_rate": 9.606774193548388e-05, + "loss": 0.168, + "step": 2939 + }, + { + "epoch": 0.04704, + "grad_norm": 0.91796875, + "learning_rate": 9.606612903225806e-05, + "loss": 0.1577, + "step": 2940 + }, + { + "epoch": 0.047056, + "grad_norm": 1.1171875, + "learning_rate": 9.606451612903226e-05, + "loss": 0.1633, + "step": 2941 + }, + { + "epoch": 0.047072, + "grad_norm": 0.76953125, + "learning_rate": 9.606290322580646e-05, + "loss": 0.1705, + "step": 2942 + }, + { + "epoch": 0.047088, + "grad_norm": 0.77734375, + "learning_rate": 9.606129032258066e-05, + "loss": 0.1577, + "step": 2943 + }, + { + "epoch": 0.047104, + "grad_norm": 0.71484375, + "learning_rate": 9.605967741935485e-05, + "loss": 0.1908, + "step": 2944 + }, + { + "epoch": 0.04712, + "grad_norm": 0.921875, + "learning_rate": 9.605806451612903e-05, + "loss": 0.2032, + "step": 2945 + }, + { + "epoch": 0.047136, + "grad_norm": 0.6328125, + "learning_rate": 9.605645161290323e-05, + "loss": 0.1544, + "step": 2946 + }, + { + "epoch": 0.047152, + "grad_norm": 0.92578125, + "learning_rate": 9.605483870967742e-05, + "loss": 0.2147, + "step": 2947 + }, + { + "epoch": 0.047168, + "grad_norm": 1.5859375, + "learning_rate": 9.605322580645162e-05, + "loss": 0.2061, + "step": 2948 + }, + { + "epoch": 0.047184, + "grad_norm": 1.328125, + "learning_rate": 9.60516129032258e-05, + "loss": 0.1918, + "step": 2949 + }, + { + "epoch": 0.0472, + "grad_norm": 0.9609375, + "learning_rate": 9.605e-05, + "loss": 0.1916, + "step": 2950 + }, + { + "epoch": 0.047216, + "grad_norm": 1.265625, + "learning_rate": 9.604838709677419e-05, + "loss": 0.1966, + "step": 2951 + }, + { + "epoch": 0.047232, + "grad_norm": 1.2734375, + "learning_rate": 9.604677419354839e-05, + "loss": 0.2084, + "step": 2952 + }, + { + "epoch": 0.047248, + "grad_norm": 0.89453125, + "learning_rate": 9.604516129032259e-05, + "loss": 0.1896, + "step": 2953 + }, + { + "epoch": 0.047264, + "grad_norm": 0.75390625, + "learning_rate": 9.604354838709679e-05, + "loss": 0.202, + "step": 2954 + }, + { + "epoch": 0.04728, + "grad_norm": 0.7890625, + "learning_rate": 9.604193548387097e-05, + "loss": 0.1849, + "step": 2955 + }, + { + "epoch": 0.047296, + "grad_norm": 0.74609375, + "learning_rate": 9.604032258064517e-05, + "loss": 0.1871, + "step": 2956 + }, + { + "epoch": 0.047312, + "grad_norm": 0.9375, + "learning_rate": 9.603870967741936e-05, + "loss": 0.1719, + "step": 2957 + }, + { + "epoch": 0.047328, + "grad_norm": 1.03125, + "learning_rate": 9.603709677419356e-05, + "loss": 0.1538, + "step": 2958 + }, + { + "epoch": 0.047344, + "grad_norm": 1.453125, + "learning_rate": 9.603548387096775e-05, + "loss": 0.1836, + "step": 2959 + }, + { + "epoch": 0.04736, + "grad_norm": 0.65625, + "learning_rate": 9.603387096774193e-05, + "loss": 0.1862, + "step": 2960 + }, + { + "epoch": 0.047376, + "grad_norm": 0.81640625, + "learning_rate": 9.603225806451613e-05, + "loss": 0.1643, + "step": 2961 + }, + { + "epoch": 0.047392, + "grad_norm": 0.75, + "learning_rate": 9.603064516129032e-05, + "loss": 0.1578, + "step": 2962 + }, + { + "epoch": 0.047408, + "grad_norm": 1.03125, + "learning_rate": 9.602903225806452e-05, + "loss": 0.2027, + "step": 2963 + }, + { + "epoch": 0.047424, + "grad_norm": 0.91015625, + "learning_rate": 9.60274193548387e-05, + "loss": 0.1989, + "step": 2964 + }, + { + "epoch": 0.04744, + "grad_norm": 1.2109375, + "learning_rate": 9.60258064516129e-05, + "loss": 0.2531, + "step": 2965 + }, + { + "epoch": 0.047456, + "grad_norm": 1.5546875, + "learning_rate": 9.60241935483871e-05, + "loss": 0.1934, + "step": 2966 + }, + { + "epoch": 0.047472, + "grad_norm": 0.64453125, + "learning_rate": 9.60225806451613e-05, + "loss": 0.1819, + "step": 2967 + }, + { + "epoch": 0.047488, + "grad_norm": 1.6796875, + "learning_rate": 9.602096774193549e-05, + "loss": 0.1913, + "step": 2968 + }, + { + "epoch": 0.047504, + "grad_norm": 0.796875, + "learning_rate": 9.601935483870969e-05, + "loss": 0.1891, + "step": 2969 + }, + { + "epoch": 0.04752, + "grad_norm": 0.87109375, + "learning_rate": 9.601774193548387e-05, + "loss": 0.1826, + "step": 2970 + }, + { + "epoch": 0.047536, + "grad_norm": 0.9140625, + "learning_rate": 9.601612903225807e-05, + "loss": 0.1706, + "step": 2971 + }, + { + "epoch": 0.047552, + "grad_norm": 1.0546875, + "learning_rate": 9.601451612903226e-05, + "loss": 0.1566, + "step": 2972 + }, + { + "epoch": 0.047568, + "grad_norm": 0.86328125, + "learning_rate": 9.601290322580646e-05, + "loss": 0.1888, + "step": 2973 + }, + { + "epoch": 0.047584, + "grad_norm": 1.109375, + "learning_rate": 9.601129032258065e-05, + "loss": 0.1938, + "step": 2974 + }, + { + "epoch": 0.0476, + "grad_norm": 0.87890625, + "learning_rate": 9.600967741935485e-05, + "loss": 0.1818, + "step": 2975 + }, + { + "epoch": 0.047616, + "grad_norm": 0.7890625, + "learning_rate": 9.600806451612903e-05, + "loss": 0.2394, + "step": 2976 + }, + { + "epoch": 0.047632, + "grad_norm": 1.296875, + "learning_rate": 9.600645161290323e-05, + "loss": 0.2246, + "step": 2977 + }, + { + "epoch": 0.047648, + "grad_norm": 1.609375, + "learning_rate": 9.600483870967743e-05, + "loss": 0.1906, + "step": 2978 + }, + { + "epoch": 0.047664, + "grad_norm": 0.96875, + "learning_rate": 9.600322580645162e-05, + "loss": 0.1725, + "step": 2979 + }, + { + "epoch": 0.04768, + "grad_norm": 0.90625, + "learning_rate": 9.600161290322582e-05, + "loss": 0.139, + "step": 2980 + }, + { + "epoch": 0.047696, + "grad_norm": 0.91796875, + "learning_rate": 9.6e-05, + "loss": 0.1736, + "step": 2981 + }, + { + "epoch": 0.047712, + "grad_norm": 1.0234375, + "learning_rate": 9.59983870967742e-05, + "loss": 0.1988, + "step": 2982 + }, + { + "epoch": 0.047728, + "grad_norm": 0.87109375, + "learning_rate": 9.599677419354839e-05, + "loss": 0.2156, + "step": 2983 + }, + { + "epoch": 0.047744, + "grad_norm": 1.265625, + "learning_rate": 9.599516129032259e-05, + "loss": 0.1951, + "step": 2984 + }, + { + "epoch": 0.04776, + "grad_norm": 1.0234375, + "learning_rate": 9.599354838709677e-05, + "loss": 0.1662, + "step": 2985 + }, + { + "epoch": 0.047776, + "grad_norm": 0.93359375, + "learning_rate": 9.599193548387097e-05, + "loss": 0.1749, + "step": 2986 + }, + { + "epoch": 0.047792, + "grad_norm": 0.80859375, + "learning_rate": 9.599032258064516e-05, + "loss": 0.1619, + "step": 2987 + }, + { + "epoch": 0.047808, + "grad_norm": 0.6015625, + "learning_rate": 9.598870967741936e-05, + "loss": 0.1667, + "step": 2988 + }, + { + "epoch": 0.047824, + "grad_norm": 0.96484375, + "learning_rate": 9.598709677419356e-05, + "loss": 0.188, + "step": 2989 + }, + { + "epoch": 0.04784, + "grad_norm": 0.5390625, + "learning_rate": 9.598548387096776e-05, + "loss": 0.1364, + "step": 2990 + }, + { + "epoch": 0.047856, + "grad_norm": 1.6015625, + "learning_rate": 9.598387096774194e-05, + "loss": 0.2516, + "step": 2991 + }, + { + "epoch": 0.047872, + "grad_norm": 0.97265625, + "learning_rate": 9.598225806451613e-05, + "loss": 0.1682, + "step": 2992 + }, + { + "epoch": 0.047888, + "grad_norm": 0.97265625, + "learning_rate": 9.598064516129033e-05, + "loss": 0.259, + "step": 2993 + }, + { + "epoch": 0.047904, + "grad_norm": 0.67578125, + "learning_rate": 9.597903225806452e-05, + "loss": 0.1802, + "step": 2994 + }, + { + "epoch": 0.04792, + "grad_norm": 0.78125, + "learning_rate": 9.597741935483872e-05, + "loss": 0.1776, + "step": 2995 + }, + { + "epoch": 0.047936, + "grad_norm": 0.90234375, + "learning_rate": 9.59758064516129e-05, + "loss": 0.2207, + "step": 2996 + }, + { + "epoch": 0.047952, + "grad_norm": 0.765625, + "learning_rate": 9.59741935483871e-05, + "loss": 0.1744, + "step": 2997 + }, + { + "epoch": 0.047968, + "grad_norm": 1.390625, + "learning_rate": 9.597258064516129e-05, + "loss": 0.1954, + "step": 2998 + }, + { + "epoch": 0.047984, + "grad_norm": 0.88671875, + "learning_rate": 9.597096774193549e-05, + "loss": 0.1954, + "step": 2999 + }, + { + "epoch": 0.048, + "grad_norm": 1.4296875, + "learning_rate": 9.596935483870967e-05, + "loss": 0.168, + "step": 3000 + }, + { + "epoch": 0.048016, + "grad_norm": 1.1015625, + "learning_rate": 9.596774193548387e-05, + "loss": 0.1775, + "step": 3001 + }, + { + "epoch": 0.048032, + "grad_norm": 0.765625, + "learning_rate": 9.596612903225807e-05, + "loss": 0.1668, + "step": 3002 + }, + { + "epoch": 0.048048, + "grad_norm": 1.140625, + "learning_rate": 9.596451612903227e-05, + "loss": 0.162, + "step": 3003 + }, + { + "epoch": 0.048064, + "grad_norm": 0.765625, + "learning_rate": 9.596290322580646e-05, + "loss": 0.2113, + "step": 3004 + }, + { + "epoch": 0.04808, + "grad_norm": 1.5234375, + "learning_rate": 9.596129032258066e-05, + "loss": 0.2058, + "step": 3005 + }, + { + "epoch": 0.048096, + "grad_norm": 1.140625, + "learning_rate": 9.595967741935484e-05, + "loss": 0.19, + "step": 3006 + }, + { + "epoch": 0.048112, + "grad_norm": 1.59375, + "learning_rate": 9.595806451612903e-05, + "loss": 0.1858, + "step": 3007 + }, + { + "epoch": 0.048128, + "grad_norm": 0.96875, + "learning_rate": 9.595645161290323e-05, + "loss": 0.1768, + "step": 3008 + }, + { + "epoch": 0.048144, + "grad_norm": 0.73046875, + "learning_rate": 9.595483870967742e-05, + "loss": 0.1826, + "step": 3009 + }, + { + "epoch": 0.04816, + "grad_norm": 1.109375, + "learning_rate": 9.595322580645162e-05, + "loss": 0.1781, + "step": 3010 + }, + { + "epoch": 0.048176, + "grad_norm": 0.66796875, + "learning_rate": 9.59516129032258e-05, + "loss": 0.1582, + "step": 3011 + }, + { + "epoch": 0.048192, + "grad_norm": 0.8203125, + "learning_rate": 9.595e-05, + "loss": 0.1996, + "step": 3012 + }, + { + "epoch": 0.048208, + "grad_norm": 1.2109375, + "learning_rate": 9.59483870967742e-05, + "loss": 0.2472, + "step": 3013 + }, + { + "epoch": 0.048224, + "grad_norm": 1.734375, + "learning_rate": 9.59467741935484e-05, + "loss": 0.1868, + "step": 3014 + }, + { + "epoch": 0.04824, + "grad_norm": 0.703125, + "learning_rate": 9.594516129032259e-05, + "loss": 0.1947, + "step": 3015 + }, + { + "epoch": 0.048256, + "grad_norm": 1.2578125, + "learning_rate": 9.594354838709679e-05, + "loss": 0.1854, + "step": 3016 + }, + { + "epoch": 0.048272, + "grad_norm": 0.62109375, + "learning_rate": 9.594193548387097e-05, + "loss": 0.1851, + "step": 3017 + }, + { + "epoch": 0.048288, + "grad_norm": 1.046875, + "learning_rate": 9.594032258064517e-05, + "loss": 0.1843, + "step": 3018 + }, + { + "epoch": 0.048304, + "grad_norm": 1.1015625, + "learning_rate": 9.593870967741936e-05, + "loss": 0.1976, + "step": 3019 + }, + { + "epoch": 0.04832, + "grad_norm": 0.84765625, + "learning_rate": 9.593709677419356e-05, + "loss": 0.1576, + "step": 3020 + }, + { + "epoch": 0.048336, + "grad_norm": 0.8125, + "learning_rate": 9.593548387096774e-05, + "loss": 0.2033, + "step": 3021 + }, + { + "epoch": 0.048352, + "grad_norm": 0.72265625, + "learning_rate": 9.593387096774194e-05, + "loss": 0.1804, + "step": 3022 + }, + { + "epoch": 0.048368, + "grad_norm": 0.84375, + "learning_rate": 9.593225806451613e-05, + "loss": 0.1542, + "step": 3023 + }, + { + "epoch": 0.048384, + "grad_norm": 0.921875, + "learning_rate": 9.593064516129033e-05, + "loss": 0.2051, + "step": 3024 + }, + { + "epoch": 0.0484, + "grad_norm": 1.1171875, + "learning_rate": 9.592903225806452e-05, + "loss": 0.16, + "step": 3025 + }, + { + "epoch": 0.048416, + "grad_norm": 0.67578125, + "learning_rate": 9.592741935483871e-05, + "loss": 0.1945, + "step": 3026 + }, + { + "epoch": 0.048432, + "grad_norm": 0.80078125, + "learning_rate": 9.592580645161291e-05, + "loss": 0.1912, + "step": 3027 + }, + { + "epoch": 0.048448, + "grad_norm": 0.76171875, + "learning_rate": 9.59241935483871e-05, + "loss": 0.1806, + "step": 3028 + }, + { + "epoch": 0.048464, + "grad_norm": 0.73046875, + "learning_rate": 9.59225806451613e-05, + "loss": 0.1649, + "step": 3029 + }, + { + "epoch": 0.04848, + "grad_norm": 0.86328125, + "learning_rate": 9.592096774193549e-05, + "loss": 0.1709, + "step": 3030 + }, + { + "epoch": 0.048496, + "grad_norm": 1.5625, + "learning_rate": 9.591935483870969e-05, + "loss": 0.1815, + "step": 3031 + }, + { + "epoch": 0.048512, + "grad_norm": 1.1015625, + "learning_rate": 9.591774193548387e-05, + "loss": 0.1883, + "step": 3032 + }, + { + "epoch": 0.048528, + "grad_norm": 1.4296875, + "learning_rate": 9.591612903225807e-05, + "loss": 0.2123, + "step": 3033 + }, + { + "epoch": 0.048544, + "grad_norm": 0.94140625, + "learning_rate": 9.591451612903226e-05, + "loss": 0.2101, + "step": 3034 + }, + { + "epoch": 0.04856, + "grad_norm": 1.734375, + "learning_rate": 9.591290322580646e-05, + "loss": 0.1885, + "step": 3035 + }, + { + "epoch": 0.048576, + "grad_norm": 0.48828125, + "learning_rate": 9.591129032258064e-05, + "loss": 0.1654, + "step": 3036 + }, + { + "epoch": 0.048592, + "grad_norm": 0.75, + "learning_rate": 9.590967741935484e-05, + "loss": 0.1721, + "step": 3037 + }, + { + "epoch": 0.048608, + "grad_norm": 0.7734375, + "learning_rate": 9.590806451612904e-05, + "loss": 0.2043, + "step": 3038 + }, + { + "epoch": 0.048624, + "grad_norm": 0.890625, + "learning_rate": 9.590645161290323e-05, + "loss": 0.1706, + "step": 3039 + }, + { + "epoch": 0.04864, + "grad_norm": 0.61328125, + "learning_rate": 9.590483870967743e-05, + "loss": 0.1557, + "step": 3040 + }, + { + "epoch": 0.048656, + "grad_norm": 0.98828125, + "learning_rate": 9.590322580645161e-05, + "loss": 0.2304, + "step": 3041 + }, + { + "epoch": 0.048672, + "grad_norm": 0.87109375, + "learning_rate": 9.590161290322581e-05, + "loss": 0.1904, + "step": 3042 + }, + { + "epoch": 0.048688, + "grad_norm": 1.3203125, + "learning_rate": 9.59e-05, + "loss": 0.1923, + "step": 3043 + }, + { + "epoch": 0.048704, + "grad_norm": 0.59765625, + "learning_rate": 9.58983870967742e-05, + "loss": 0.1702, + "step": 3044 + }, + { + "epoch": 0.04872, + "grad_norm": 1.125, + "learning_rate": 9.589677419354839e-05, + "loss": 0.1685, + "step": 3045 + }, + { + "epoch": 0.048736, + "grad_norm": 0.87109375, + "learning_rate": 9.589516129032259e-05, + "loss": 0.1779, + "step": 3046 + }, + { + "epoch": 0.048752, + "grad_norm": 0.68359375, + "learning_rate": 9.589354838709677e-05, + "loss": 0.1957, + "step": 3047 + }, + { + "epoch": 0.048768, + "grad_norm": 1.0, + "learning_rate": 9.589193548387097e-05, + "loss": 0.2087, + "step": 3048 + }, + { + "epoch": 0.048784, + "grad_norm": 0.83203125, + "learning_rate": 9.589032258064517e-05, + "loss": 0.1652, + "step": 3049 + }, + { + "epoch": 0.0488, + "grad_norm": 0.9921875, + "learning_rate": 9.588870967741937e-05, + "loss": 0.1962, + "step": 3050 + }, + { + "epoch": 0.048816, + "grad_norm": 0.7265625, + "learning_rate": 9.588709677419356e-05, + "loss": 0.1621, + "step": 3051 + }, + { + "epoch": 0.048832, + "grad_norm": 0.80078125, + "learning_rate": 9.588548387096776e-05, + "loss": 0.1711, + "step": 3052 + }, + { + "epoch": 0.048848, + "grad_norm": 0.9453125, + "learning_rate": 9.588387096774194e-05, + "loss": 0.1934, + "step": 3053 + }, + { + "epoch": 0.048864, + "grad_norm": 1.1484375, + "learning_rate": 9.588225806451613e-05, + "loss": 0.182, + "step": 3054 + }, + { + "epoch": 0.04888, + "grad_norm": 0.7734375, + "learning_rate": 9.588064516129033e-05, + "loss": 0.1858, + "step": 3055 + }, + { + "epoch": 0.048896, + "grad_norm": 1.0078125, + "learning_rate": 9.587903225806451e-05, + "loss": 0.2309, + "step": 3056 + }, + { + "epoch": 0.048912, + "grad_norm": 0.91015625, + "learning_rate": 9.587741935483871e-05, + "loss": 0.2139, + "step": 3057 + }, + { + "epoch": 0.048928, + "grad_norm": 0.6953125, + "learning_rate": 9.58758064516129e-05, + "loss": 0.1632, + "step": 3058 + }, + { + "epoch": 0.048944, + "grad_norm": 0.72265625, + "learning_rate": 9.58741935483871e-05, + "loss": 0.1909, + "step": 3059 + }, + { + "epoch": 0.04896, + "grad_norm": 1.0625, + "learning_rate": 9.587258064516129e-05, + "loss": 0.2563, + "step": 3060 + }, + { + "epoch": 0.048976, + "grad_norm": 0.640625, + "learning_rate": 9.587096774193548e-05, + "loss": 0.1752, + "step": 3061 + }, + { + "epoch": 0.048992, + "grad_norm": 1.328125, + "learning_rate": 9.586935483870968e-05, + "loss": 0.1978, + "step": 3062 + }, + { + "epoch": 0.049008, + "grad_norm": 0.78125, + "learning_rate": 9.586774193548388e-05, + "loss": 0.166, + "step": 3063 + }, + { + "epoch": 0.049024, + "grad_norm": 1.0, + "learning_rate": 9.586612903225807e-05, + "loss": 0.1969, + "step": 3064 + }, + { + "epoch": 0.04904, + "grad_norm": 0.8203125, + "learning_rate": 9.586451612903227e-05, + "loss": 0.1863, + "step": 3065 + }, + { + "epoch": 0.049056, + "grad_norm": 1.375, + "learning_rate": 9.586290322580646e-05, + "loss": 0.2265, + "step": 3066 + }, + { + "epoch": 0.049072, + "grad_norm": 1.484375, + "learning_rate": 9.586129032258066e-05, + "loss": 0.204, + "step": 3067 + }, + { + "epoch": 0.049088, + "grad_norm": 1.359375, + "learning_rate": 9.585967741935484e-05, + "loss": 0.1859, + "step": 3068 + }, + { + "epoch": 0.049104, + "grad_norm": 1.078125, + "learning_rate": 9.585806451612903e-05, + "loss": 0.2041, + "step": 3069 + }, + { + "epoch": 0.04912, + "grad_norm": 1.171875, + "learning_rate": 9.585645161290323e-05, + "loss": 0.2137, + "step": 3070 + }, + { + "epoch": 0.049136, + "grad_norm": 1.015625, + "learning_rate": 9.585483870967741e-05, + "loss": 0.2523, + "step": 3071 + }, + { + "epoch": 0.049152, + "grad_norm": 0.74609375, + "learning_rate": 9.585322580645161e-05, + "loss": 0.1758, + "step": 3072 + }, + { + "epoch": 0.049168, + "grad_norm": 1.7421875, + "learning_rate": 9.585161290322581e-05, + "loss": 0.1602, + "step": 3073 + }, + { + "epoch": 0.049184, + "grad_norm": 1.0625, + "learning_rate": 9.585000000000001e-05, + "loss": 0.1965, + "step": 3074 + }, + { + "epoch": 0.0492, + "grad_norm": 1.875, + "learning_rate": 9.58483870967742e-05, + "loss": 0.2018, + "step": 3075 + }, + { + "epoch": 0.049216, + "grad_norm": 1.6484375, + "learning_rate": 9.58467741935484e-05, + "loss": 0.2117, + "step": 3076 + }, + { + "epoch": 0.049232, + "grad_norm": 1.109375, + "learning_rate": 9.584516129032258e-05, + "loss": 0.2018, + "step": 3077 + }, + { + "epoch": 0.049248, + "grad_norm": 1.0625, + "learning_rate": 9.584354838709678e-05, + "loss": 0.2308, + "step": 3078 + }, + { + "epoch": 0.049264, + "grad_norm": 1.1171875, + "learning_rate": 9.584193548387097e-05, + "loss": 0.2037, + "step": 3079 + }, + { + "epoch": 0.04928, + "grad_norm": 0.84375, + "learning_rate": 9.584032258064517e-05, + "loss": 0.1906, + "step": 3080 + }, + { + "epoch": 0.049296, + "grad_norm": 0.984375, + "learning_rate": 9.583870967741936e-05, + "loss": 0.1842, + "step": 3081 + }, + { + "epoch": 0.049312, + "grad_norm": 0.69921875, + "learning_rate": 9.583709677419356e-05, + "loss": 0.1531, + "step": 3082 + }, + { + "epoch": 0.049328, + "grad_norm": 1.0234375, + "learning_rate": 9.583548387096774e-05, + "loss": 0.1954, + "step": 3083 + }, + { + "epoch": 0.049344, + "grad_norm": 0.81640625, + "learning_rate": 9.583387096774194e-05, + "loss": 0.1839, + "step": 3084 + }, + { + "epoch": 0.04936, + "grad_norm": 1.3515625, + "learning_rate": 9.583225806451614e-05, + "loss": 0.1654, + "step": 3085 + }, + { + "epoch": 0.049376, + "grad_norm": 0.890625, + "learning_rate": 9.583064516129033e-05, + "loss": 0.1922, + "step": 3086 + }, + { + "epoch": 0.049392, + "grad_norm": 1.25, + "learning_rate": 9.582903225806453e-05, + "loss": 0.185, + "step": 3087 + }, + { + "epoch": 0.049408, + "grad_norm": 0.609375, + "learning_rate": 9.582741935483871e-05, + "loss": 0.1597, + "step": 3088 + }, + { + "epoch": 0.049424, + "grad_norm": 0.62890625, + "learning_rate": 9.582580645161291e-05, + "loss": 0.1541, + "step": 3089 + }, + { + "epoch": 0.04944, + "grad_norm": 0.80859375, + "learning_rate": 9.58241935483871e-05, + "loss": 0.1895, + "step": 3090 + }, + { + "epoch": 0.049456, + "grad_norm": 0.75390625, + "learning_rate": 9.58225806451613e-05, + "loss": 0.1864, + "step": 3091 + }, + { + "epoch": 0.049472, + "grad_norm": 0.92578125, + "learning_rate": 9.582096774193548e-05, + "loss": 0.1615, + "step": 3092 + }, + { + "epoch": 0.049488, + "grad_norm": 0.71484375, + "learning_rate": 9.581935483870968e-05, + "loss": 0.1736, + "step": 3093 + }, + { + "epoch": 0.049504, + "grad_norm": 1.6171875, + "learning_rate": 9.581774193548387e-05, + "loss": 0.2397, + "step": 3094 + }, + { + "epoch": 0.04952, + "grad_norm": 1.1484375, + "learning_rate": 9.581612903225807e-05, + "loss": 0.2397, + "step": 3095 + }, + { + "epoch": 0.049536, + "grad_norm": 1.0234375, + "learning_rate": 9.581451612903226e-05, + "loss": 0.1465, + "step": 3096 + }, + { + "epoch": 0.049552, + "grad_norm": 0.74609375, + "learning_rate": 9.581290322580645e-05, + "loss": 0.1966, + "step": 3097 + }, + { + "epoch": 0.049568, + "grad_norm": 1.1953125, + "learning_rate": 9.581129032258065e-05, + "loss": 0.2117, + "step": 3098 + }, + { + "epoch": 0.049584, + "grad_norm": 0.734375, + "learning_rate": 9.580967741935485e-05, + "loss": 0.1535, + "step": 3099 + }, + { + "epoch": 0.0496, + "grad_norm": 0.6875, + "learning_rate": 9.580806451612904e-05, + "loss": 0.1781, + "step": 3100 + }, + { + "epoch": 0.049616, + "grad_norm": 1.03125, + "learning_rate": 9.580645161290323e-05, + "loss": 0.2007, + "step": 3101 + }, + { + "epoch": 0.049632, + "grad_norm": 0.7890625, + "learning_rate": 9.580483870967743e-05, + "loss": 0.167, + "step": 3102 + }, + { + "epoch": 0.049648, + "grad_norm": 1.109375, + "learning_rate": 9.580322580645161e-05, + "loss": 0.1721, + "step": 3103 + }, + { + "epoch": 0.049664, + "grad_norm": 0.66796875, + "learning_rate": 9.580161290322581e-05, + "loss": 0.1489, + "step": 3104 + }, + { + "epoch": 0.04968, + "grad_norm": 0.875, + "learning_rate": 9.58e-05, + "loss": 0.1996, + "step": 3105 + }, + { + "epoch": 0.049696, + "grad_norm": 1.3671875, + "learning_rate": 9.57983870967742e-05, + "loss": 0.2335, + "step": 3106 + }, + { + "epoch": 0.049712, + "grad_norm": 0.796875, + "learning_rate": 9.579677419354838e-05, + "loss": 0.2009, + "step": 3107 + }, + { + "epoch": 0.049728, + "grad_norm": 1.0546875, + "learning_rate": 9.579516129032258e-05, + "loss": 0.2116, + "step": 3108 + }, + { + "epoch": 0.049744, + "grad_norm": 1.21875, + "learning_rate": 9.579354838709678e-05, + "loss": 0.2131, + "step": 3109 + }, + { + "epoch": 0.04976, + "grad_norm": 0.90234375, + "learning_rate": 9.579193548387098e-05, + "loss": 0.1409, + "step": 3110 + }, + { + "epoch": 0.049776, + "grad_norm": 1.03125, + "learning_rate": 9.579032258064517e-05, + "loss": 0.1777, + "step": 3111 + }, + { + "epoch": 0.049792, + "grad_norm": 1.1328125, + "learning_rate": 9.578870967741937e-05, + "loss": 0.1866, + "step": 3112 + }, + { + "epoch": 0.049808, + "grad_norm": 1.3046875, + "learning_rate": 9.578709677419355e-05, + "loss": 0.18, + "step": 3113 + }, + { + "epoch": 0.049824, + "grad_norm": 1.1953125, + "learning_rate": 9.578548387096775e-05, + "loss": 0.1591, + "step": 3114 + }, + { + "epoch": 0.04984, + "grad_norm": 0.89453125, + "learning_rate": 9.578387096774194e-05, + "loss": 0.1725, + "step": 3115 + }, + { + "epoch": 0.049856, + "grad_norm": 0.78125, + "learning_rate": 9.578225806451613e-05, + "loss": 0.1941, + "step": 3116 + }, + { + "epoch": 0.049872, + "grad_norm": 0.765625, + "learning_rate": 9.578064516129033e-05, + "loss": 0.1648, + "step": 3117 + }, + { + "epoch": 0.049888, + "grad_norm": 0.96484375, + "learning_rate": 9.577903225806451e-05, + "loss": 0.1676, + "step": 3118 + }, + { + "epoch": 0.049904, + "grad_norm": 0.86328125, + "learning_rate": 9.577741935483871e-05, + "loss": 0.2091, + "step": 3119 + }, + { + "epoch": 0.04992, + "grad_norm": 0.63671875, + "learning_rate": 9.57758064516129e-05, + "loss": 0.1375, + "step": 3120 + }, + { + "epoch": 0.049936, + "grad_norm": 0.86328125, + "learning_rate": 9.57741935483871e-05, + "loss": 0.2127, + "step": 3121 + }, + { + "epoch": 0.049952, + "grad_norm": 1.5078125, + "learning_rate": 9.57725806451613e-05, + "loss": 0.1922, + "step": 3122 + }, + { + "epoch": 0.049968, + "grad_norm": 1.015625, + "learning_rate": 9.57709677419355e-05, + "loss": 0.1724, + "step": 3123 + }, + { + "epoch": 0.049984, + "grad_norm": 1.1328125, + "learning_rate": 9.576935483870968e-05, + "loss": 0.2059, + "step": 3124 + }, + { + "epoch": 0.05, + "grad_norm": 0.79296875, + "learning_rate": 9.576774193548388e-05, + "loss": 0.1365, + "step": 3125 + }, + { + "epoch": 0.050016, + "grad_norm": 0.73828125, + "learning_rate": 9.576612903225807e-05, + "loss": 0.2059, + "step": 3126 + }, + { + "epoch": 0.050032, + "grad_norm": 1.25, + "learning_rate": 9.576451612903227e-05, + "loss": 0.2226, + "step": 3127 + }, + { + "epoch": 0.050048, + "grad_norm": 0.70703125, + "learning_rate": 9.576290322580645e-05, + "loss": 0.1529, + "step": 3128 + }, + { + "epoch": 0.050064, + "grad_norm": 0.87890625, + "learning_rate": 9.576129032258065e-05, + "loss": 0.1837, + "step": 3129 + }, + { + "epoch": 0.05008, + "grad_norm": 0.85546875, + "learning_rate": 9.575967741935484e-05, + "loss": 0.2053, + "step": 3130 + }, + { + "epoch": 0.050096, + "grad_norm": 0.80078125, + "learning_rate": 9.575806451612904e-05, + "loss": 0.1853, + "step": 3131 + }, + { + "epoch": 0.050112, + "grad_norm": 0.78515625, + "learning_rate": 9.575645161290322e-05, + "loss": 0.1825, + "step": 3132 + }, + { + "epoch": 0.050128, + "grad_norm": 1.375, + "learning_rate": 9.575483870967742e-05, + "loss": 0.2459, + "step": 3133 + }, + { + "epoch": 0.050144, + "grad_norm": 0.98828125, + "learning_rate": 9.575322580645162e-05, + "loss": 0.185, + "step": 3134 + }, + { + "epoch": 0.05016, + "grad_norm": 0.828125, + "learning_rate": 9.575161290322581e-05, + "loss": 0.2057, + "step": 3135 + }, + { + "epoch": 0.050176, + "grad_norm": 0.97265625, + "learning_rate": 9.575000000000001e-05, + "loss": 0.1806, + "step": 3136 + }, + { + "epoch": 0.050192, + "grad_norm": 1.015625, + "learning_rate": 9.57483870967742e-05, + "loss": 0.1767, + "step": 3137 + }, + { + "epoch": 0.050208, + "grad_norm": 1.453125, + "learning_rate": 9.57467741935484e-05, + "loss": 0.2059, + "step": 3138 + }, + { + "epoch": 0.050224, + "grad_norm": 0.7734375, + "learning_rate": 9.574516129032258e-05, + "loss": 0.1901, + "step": 3139 + }, + { + "epoch": 0.05024, + "grad_norm": 0.765625, + "learning_rate": 9.574354838709678e-05, + "loss": 0.2364, + "step": 3140 + }, + { + "epoch": 0.050256, + "grad_norm": 0.6953125, + "learning_rate": 9.574193548387097e-05, + "loss": 0.1558, + "step": 3141 + }, + { + "epoch": 0.050272, + "grad_norm": 0.91796875, + "learning_rate": 9.574032258064517e-05, + "loss": 0.1833, + "step": 3142 + }, + { + "epoch": 0.050288, + "grad_norm": 0.8046875, + "learning_rate": 9.573870967741935e-05, + "loss": 0.1947, + "step": 3143 + }, + { + "epoch": 0.050304, + "grad_norm": 1.0390625, + "learning_rate": 9.573709677419355e-05, + "loss": 0.2065, + "step": 3144 + }, + { + "epoch": 0.05032, + "grad_norm": 0.94921875, + "learning_rate": 9.573548387096775e-05, + "loss": 0.2016, + "step": 3145 + }, + { + "epoch": 0.050336, + "grad_norm": 1.0859375, + "learning_rate": 9.573387096774195e-05, + "loss": 0.1781, + "step": 3146 + }, + { + "epoch": 0.050352, + "grad_norm": 0.85546875, + "learning_rate": 9.573225806451614e-05, + "loss": 0.1813, + "step": 3147 + }, + { + "epoch": 0.050368, + "grad_norm": 0.91015625, + "learning_rate": 9.573064516129032e-05, + "loss": 0.1796, + "step": 3148 + }, + { + "epoch": 0.050384, + "grad_norm": 1.34375, + "learning_rate": 9.572903225806452e-05, + "loss": 0.1993, + "step": 3149 + }, + { + "epoch": 0.0504, + "grad_norm": 1.015625, + "learning_rate": 9.572741935483871e-05, + "loss": 0.1981, + "step": 3150 + }, + { + "epoch": 0.050416, + "grad_norm": 0.86328125, + "learning_rate": 9.572580645161291e-05, + "loss": 0.1943, + "step": 3151 + }, + { + "epoch": 0.050432, + "grad_norm": 0.63671875, + "learning_rate": 9.57241935483871e-05, + "loss": 0.1874, + "step": 3152 + }, + { + "epoch": 0.050448, + "grad_norm": 0.9921875, + "learning_rate": 9.57225806451613e-05, + "loss": 0.1844, + "step": 3153 + }, + { + "epoch": 0.050464, + "grad_norm": 1.1796875, + "learning_rate": 9.572096774193548e-05, + "loss": 0.1591, + "step": 3154 + }, + { + "epoch": 0.05048, + "grad_norm": 0.921875, + "learning_rate": 9.571935483870968e-05, + "loss": 0.1993, + "step": 3155 + }, + { + "epoch": 0.050496, + "grad_norm": 0.7578125, + "learning_rate": 9.571774193548387e-05, + "loss": 0.214, + "step": 3156 + }, + { + "epoch": 0.050512, + "grad_norm": 0.61328125, + "learning_rate": 9.571612903225807e-05, + "loss": 0.1745, + "step": 3157 + }, + { + "epoch": 0.050528, + "grad_norm": 0.75390625, + "learning_rate": 9.571451612903227e-05, + "loss": 0.1774, + "step": 3158 + }, + { + "epoch": 0.050544, + "grad_norm": 0.7265625, + "learning_rate": 9.571290322580647e-05, + "loss": 0.166, + "step": 3159 + }, + { + "epoch": 0.05056, + "grad_norm": 0.47265625, + "learning_rate": 9.571129032258065e-05, + "loss": 0.1299, + "step": 3160 + }, + { + "epoch": 0.050576, + "grad_norm": 0.82421875, + "learning_rate": 9.570967741935485e-05, + "loss": 0.1986, + "step": 3161 + }, + { + "epoch": 0.050592, + "grad_norm": 0.97265625, + "learning_rate": 9.570806451612904e-05, + "loss": 0.2054, + "step": 3162 + }, + { + "epoch": 0.050608, + "grad_norm": 0.8359375, + "learning_rate": 9.570645161290322e-05, + "loss": 0.1699, + "step": 3163 + }, + { + "epoch": 0.050624, + "grad_norm": 1.125, + "learning_rate": 9.570483870967742e-05, + "loss": 0.1303, + "step": 3164 + }, + { + "epoch": 0.05064, + "grad_norm": 1.0, + "learning_rate": 9.570322580645161e-05, + "loss": 0.1608, + "step": 3165 + }, + { + "epoch": 0.050656, + "grad_norm": 0.7578125, + "learning_rate": 9.570161290322581e-05, + "loss": 0.1988, + "step": 3166 + }, + { + "epoch": 0.050672, + "grad_norm": 1.296875, + "learning_rate": 9.57e-05, + "loss": 0.2332, + "step": 3167 + }, + { + "epoch": 0.050688, + "grad_norm": 0.76953125, + "learning_rate": 9.56983870967742e-05, + "loss": 0.1666, + "step": 3168 + }, + { + "epoch": 0.050704, + "grad_norm": 0.6875, + "learning_rate": 9.56967741935484e-05, + "loss": 0.1708, + "step": 3169 + }, + { + "epoch": 0.05072, + "grad_norm": 0.8515625, + "learning_rate": 9.56951612903226e-05, + "loss": 0.1724, + "step": 3170 + }, + { + "epoch": 0.050736, + "grad_norm": 0.71484375, + "learning_rate": 9.569354838709678e-05, + "loss": 0.2062, + "step": 3171 + }, + { + "epoch": 0.050752, + "grad_norm": 0.8984375, + "learning_rate": 9.569193548387098e-05, + "loss": 0.1728, + "step": 3172 + }, + { + "epoch": 0.050768, + "grad_norm": 0.9921875, + "learning_rate": 9.569032258064517e-05, + "loss": 0.1915, + "step": 3173 + }, + { + "epoch": 0.050784, + "grad_norm": 1.3203125, + "learning_rate": 9.568870967741937e-05, + "loss": 0.204, + "step": 3174 + }, + { + "epoch": 0.0508, + "grad_norm": 0.84765625, + "learning_rate": 9.568709677419355e-05, + "loss": 0.2027, + "step": 3175 + }, + { + "epoch": 0.050816, + "grad_norm": 0.76953125, + "learning_rate": 9.568548387096775e-05, + "loss": 0.19, + "step": 3176 + }, + { + "epoch": 0.050832, + "grad_norm": 0.921875, + "learning_rate": 9.568387096774194e-05, + "loss": 0.1952, + "step": 3177 + }, + { + "epoch": 0.050848, + "grad_norm": 0.94140625, + "learning_rate": 9.568225806451612e-05, + "loss": 0.2092, + "step": 3178 + }, + { + "epoch": 0.050864, + "grad_norm": 0.59375, + "learning_rate": 9.568064516129032e-05, + "loss": 0.148, + "step": 3179 + }, + { + "epoch": 0.05088, + "grad_norm": 0.66015625, + "learning_rate": 9.567903225806452e-05, + "loss": 0.1855, + "step": 3180 + }, + { + "epoch": 0.050896, + "grad_norm": 0.6875, + "learning_rate": 9.567741935483872e-05, + "loss": 0.2112, + "step": 3181 + }, + { + "epoch": 0.050912, + "grad_norm": 0.890625, + "learning_rate": 9.567580645161291e-05, + "loss": 0.1876, + "step": 3182 + }, + { + "epoch": 0.050928, + "grad_norm": 0.84375, + "learning_rate": 9.567419354838711e-05, + "loss": 0.1724, + "step": 3183 + }, + { + "epoch": 0.050944, + "grad_norm": 0.70703125, + "learning_rate": 9.56725806451613e-05, + "loss": 0.2163, + "step": 3184 + }, + { + "epoch": 0.05096, + "grad_norm": 1.0546875, + "learning_rate": 9.56709677419355e-05, + "loss": 0.2648, + "step": 3185 + }, + { + "epoch": 0.050976, + "grad_norm": 0.9296875, + "learning_rate": 9.566935483870968e-05, + "loss": 0.1815, + "step": 3186 + }, + { + "epoch": 0.050992, + "grad_norm": 0.703125, + "learning_rate": 9.566774193548388e-05, + "loss": 0.1823, + "step": 3187 + }, + { + "epoch": 0.051008, + "grad_norm": 0.9140625, + "learning_rate": 9.566612903225807e-05, + "loss": 0.2279, + "step": 3188 + }, + { + "epoch": 0.051024, + "grad_norm": 0.984375, + "learning_rate": 9.566451612903227e-05, + "loss": 0.189, + "step": 3189 + }, + { + "epoch": 0.05104, + "grad_norm": 0.9921875, + "learning_rate": 9.566290322580645e-05, + "loss": 0.1836, + "step": 3190 + }, + { + "epoch": 0.051056, + "grad_norm": 1.0234375, + "learning_rate": 9.566129032258065e-05, + "loss": 0.1555, + "step": 3191 + }, + { + "epoch": 0.051072, + "grad_norm": 0.78125, + "learning_rate": 9.565967741935484e-05, + "loss": 0.1721, + "step": 3192 + }, + { + "epoch": 0.051088, + "grad_norm": 0.96484375, + "learning_rate": 9.565806451612904e-05, + "loss": 0.1657, + "step": 3193 + }, + { + "epoch": 0.051104, + "grad_norm": 1.109375, + "learning_rate": 9.565645161290324e-05, + "loss": 0.2038, + "step": 3194 + }, + { + "epoch": 0.05112, + "grad_norm": 0.79296875, + "learning_rate": 9.565483870967742e-05, + "loss": 0.2042, + "step": 3195 + }, + { + "epoch": 0.051136, + "grad_norm": 0.90234375, + "learning_rate": 9.565322580645162e-05, + "loss": 0.1992, + "step": 3196 + }, + { + "epoch": 0.051152, + "grad_norm": 1.0078125, + "learning_rate": 9.565161290322581e-05, + "loss": 0.1968, + "step": 3197 + }, + { + "epoch": 0.051168, + "grad_norm": 0.921875, + "learning_rate": 9.565000000000001e-05, + "loss": 0.1444, + "step": 3198 + }, + { + "epoch": 0.051184, + "grad_norm": 0.734375, + "learning_rate": 9.56483870967742e-05, + "loss": 0.1673, + "step": 3199 + }, + { + "epoch": 0.0512, + "grad_norm": 0.83203125, + "learning_rate": 9.564677419354839e-05, + "loss": 0.1754, + "step": 3200 + }, + { + "epoch": 0.051216, + "grad_norm": 0.94921875, + "learning_rate": 9.564516129032258e-05, + "loss": 0.1934, + "step": 3201 + }, + { + "epoch": 0.051232, + "grad_norm": 0.88671875, + "learning_rate": 9.564354838709678e-05, + "loss": 0.1847, + "step": 3202 + }, + { + "epoch": 0.051248, + "grad_norm": 0.91796875, + "learning_rate": 9.564193548387096e-05, + "loss": 0.1815, + "step": 3203 + }, + { + "epoch": 0.051264, + "grad_norm": 1.2578125, + "learning_rate": 9.564032258064516e-05, + "loss": 0.1617, + "step": 3204 + }, + { + "epoch": 0.05128, + "grad_norm": 0.8984375, + "learning_rate": 9.563870967741936e-05, + "loss": 0.1965, + "step": 3205 + }, + { + "epoch": 0.051296, + "grad_norm": 0.90234375, + "learning_rate": 9.563709677419356e-05, + "loss": 0.1599, + "step": 3206 + }, + { + "epoch": 0.051312, + "grad_norm": 0.8671875, + "learning_rate": 9.563548387096775e-05, + "loss": 0.1636, + "step": 3207 + }, + { + "epoch": 0.051328, + "grad_norm": 0.6328125, + "learning_rate": 9.563387096774195e-05, + "loss": 0.1272, + "step": 3208 + }, + { + "epoch": 0.051344, + "grad_norm": 0.72265625, + "learning_rate": 9.563225806451614e-05, + "loss": 0.1824, + "step": 3209 + }, + { + "epoch": 0.05136, + "grad_norm": 0.6953125, + "learning_rate": 9.563064516129032e-05, + "loss": 0.1909, + "step": 3210 + }, + { + "epoch": 0.051376, + "grad_norm": 1.109375, + "learning_rate": 9.562903225806452e-05, + "loss": 0.1957, + "step": 3211 + }, + { + "epoch": 0.051392, + "grad_norm": 0.88671875, + "learning_rate": 9.562741935483871e-05, + "loss": 0.2156, + "step": 3212 + }, + { + "epoch": 0.051408, + "grad_norm": 0.87890625, + "learning_rate": 9.562580645161291e-05, + "loss": 0.217, + "step": 3213 + }, + { + "epoch": 0.051424, + "grad_norm": 0.67578125, + "learning_rate": 9.562419354838709e-05, + "loss": 0.1902, + "step": 3214 + }, + { + "epoch": 0.05144, + "grad_norm": 0.72265625, + "learning_rate": 9.562258064516129e-05, + "loss": 0.1772, + "step": 3215 + }, + { + "epoch": 0.051456, + "grad_norm": 0.78125, + "learning_rate": 9.562096774193548e-05, + "loss": 0.1717, + "step": 3216 + }, + { + "epoch": 0.051472, + "grad_norm": 0.71484375, + "learning_rate": 9.561935483870968e-05, + "loss": 0.1764, + "step": 3217 + }, + { + "epoch": 0.051488, + "grad_norm": 0.8828125, + "learning_rate": 9.561774193548388e-05, + "loss": 0.1673, + "step": 3218 + }, + { + "epoch": 0.051504, + "grad_norm": 0.74609375, + "learning_rate": 9.561612903225808e-05, + "loss": 0.2016, + "step": 3219 + }, + { + "epoch": 0.05152, + "grad_norm": 0.77734375, + "learning_rate": 9.561451612903226e-05, + "loss": 0.1858, + "step": 3220 + }, + { + "epoch": 0.051536, + "grad_norm": 0.9453125, + "learning_rate": 9.561290322580646e-05, + "loss": 0.2379, + "step": 3221 + }, + { + "epoch": 0.051552, + "grad_norm": 1.3828125, + "learning_rate": 9.561129032258065e-05, + "loss": 0.196, + "step": 3222 + }, + { + "epoch": 0.051568, + "grad_norm": 0.984375, + "learning_rate": 9.560967741935485e-05, + "loss": 0.2006, + "step": 3223 + }, + { + "epoch": 0.051584, + "grad_norm": 0.74609375, + "learning_rate": 9.560806451612904e-05, + "loss": 0.1789, + "step": 3224 + }, + { + "epoch": 0.0516, + "grad_norm": 1.2109375, + "learning_rate": 9.560645161290322e-05, + "loss": 0.2084, + "step": 3225 + }, + { + "epoch": 0.051616, + "grad_norm": 1.34375, + "learning_rate": 9.560483870967742e-05, + "loss": 0.187, + "step": 3226 + }, + { + "epoch": 0.051632, + "grad_norm": 1.4140625, + "learning_rate": 9.560322580645161e-05, + "loss": 0.1607, + "step": 3227 + }, + { + "epoch": 0.051648, + "grad_norm": 0.75, + "learning_rate": 9.56016129032258e-05, + "loss": 0.1862, + "step": 3228 + }, + { + "epoch": 0.051664, + "grad_norm": 0.9375, + "learning_rate": 9.56e-05, + "loss": 0.1758, + "step": 3229 + }, + { + "epoch": 0.05168, + "grad_norm": 0.87890625, + "learning_rate": 9.55983870967742e-05, + "loss": 0.1987, + "step": 3230 + }, + { + "epoch": 0.051696, + "grad_norm": 1.2734375, + "learning_rate": 9.559677419354839e-05, + "loss": 0.1818, + "step": 3231 + }, + { + "epoch": 0.051712, + "grad_norm": 1.1640625, + "learning_rate": 9.559516129032259e-05, + "loss": 0.1631, + "step": 3232 + }, + { + "epoch": 0.051728, + "grad_norm": 0.78125, + "learning_rate": 9.559354838709678e-05, + "loss": 0.144, + "step": 3233 + }, + { + "epoch": 0.051744, + "grad_norm": 1.1328125, + "learning_rate": 9.559193548387098e-05, + "loss": 0.1528, + "step": 3234 + }, + { + "epoch": 0.05176, + "grad_norm": 1.2265625, + "learning_rate": 9.559032258064516e-05, + "loss": 0.2231, + "step": 3235 + }, + { + "epoch": 0.051776, + "grad_norm": 0.96484375, + "learning_rate": 9.558870967741936e-05, + "loss": 0.224, + "step": 3236 + }, + { + "epoch": 0.051792, + "grad_norm": 0.88671875, + "learning_rate": 9.558709677419355e-05, + "loss": 0.1863, + "step": 3237 + }, + { + "epoch": 0.051808, + "grad_norm": 1.5078125, + "learning_rate": 9.558548387096775e-05, + "loss": 0.2397, + "step": 3238 + }, + { + "epoch": 0.051824, + "grad_norm": 0.74609375, + "learning_rate": 9.558387096774193e-05, + "loss": 0.2149, + "step": 3239 + }, + { + "epoch": 0.05184, + "grad_norm": 0.98046875, + "learning_rate": 9.558225806451613e-05, + "loss": 0.1649, + "step": 3240 + }, + { + "epoch": 0.051856, + "grad_norm": 0.85546875, + "learning_rate": 9.558064516129033e-05, + "loss": 0.1723, + "step": 3241 + }, + { + "epoch": 0.051872, + "grad_norm": 0.77734375, + "learning_rate": 9.557903225806452e-05, + "loss": 0.1744, + "step": 3242 + }, + { + "epoch": 0.051888, + "grad_norm": 0.9765625, + "learning_rate": 9.557741935483872e-05, + "loss": 0.1973, + "step": 3243 + }, + { + "epoch": 0.051904, + "grad_norm": 0.75, + "learning_rate": 9.55758064516129e-05, + "loss": 0.1641, + "step": 3244 + }, + { + "epoch": 0.05192, + "grad_norm": 1.078125, + "learning_rate": 9.55741935483871e-05, + "loss": 0.2192, + "step": 3245 + }, + { + "epoch": 0.051936, + "grad_norm": 1.234375, + "learning_rate": 9.557258064516129e-05, + "loss": 0.2412, + "step": 3246 + }, + { + "epoch": 0.051952, + "grad_norm": 0.828125, + "learning_rate": 9.557096774193549e-05, + "loss": 0.1723, + "step": 3247 + }, + { + "epoch": 0.051968, + "grad_norm": 0.859375, + "learning_rate": 9.556935483870968e-05, + "loss": 0.1762, + "step": 3248 + }, + { + "epoch": 0.051984, + "grad_norm": 0.69921875, + "learning_rate": 9.556774193548388e-05, + "loss": 0.1483, + "step": 3249 + }, + { + "epoch": 0.052, + "grad_norm": 2.1875, + "learning_rate": 9.556612903225806e-05, + "loss": 0.2041, + "step": 3250 + }, + { + "epoch": 0.052016, + "grad_norm": 1.0625, + "learning_rate": 9.556451612903226e-05, + "loss": 0.192, + "step": 3251 + }, + { + "epoch": 0.052032, + "grad_norm": 0.87109375, + "learning_rate": 9.556290322580645e-05, + "loss": 0.2096, + "step": 3252 + }, + { + "epoch": 0.052048, + "grad_norm": 1.015625, + "learning_rate": 9.556129032258065e-05, + "loss": 0.2428, + "step": 3253 + }, + { + "epoch": 0.052064, + "grad_norm": 1.421875, + "learning_rate": 9.555967741935485e-05, + "loss": 0.2213, + "step": 3254 + }, + { + "epoch": 0.05208, + "grad_norm": 1.3203125, + "learning_rate": 9.555806451612905e-05, + "loss": 0.1853, + "step": 3255 + }, + { + "epoch": 0.052096, + "grad_norm": 1.0859375, + "learning_rate": 9.555645161290323e-05, + "loss": 0.1872, + "step": 3256 + }, + { + "epoch": 0.052112, + "grad_norm": 0.68359375, + "learning_rate": 9.555483870967742e-05, + "loss": 0.175, + "step": 3257 + }, + { + "epoch": 0.052128, + "grad_norm": 0.671875, + "learning_rate": 9.555322580645162e-05, + "loss": 0.1911, + "step": 3258 + }, + { + "epoch": 0.052144, + "grad_norm": 1.2890625, + "learning_rate": 9.55516129032258e-05, + "loss": 0.181, + "step": 3259 + }, + { + "epoch": 0.05216, + "grad_norm": 0.79296875, + "learning_rate": 9.555e-05, + "loss": 0.1939, + "step": 3260 + }, + { + "epoch": 0.052176, + "grad_norm": 0.9765625, + "learning_rate": 9.554838709677419e-05, + "loss": 0.1478, + "step": 3261 + }, + { + "epoch": 0.052192, + "grad_norm": 1.28125, + "learning_rate": 9.554677419354839e-05, + "loss": 0.1539, + "step": 3262 + }, + { + "epoch": 0.052208, + "grad_norm": 0.8203125, + "learning_rate": 9.554516129032258e-05, + "loss": 0.1829, + "step": 3263 + }, + { + "epoch": 0.052224, + "grad_norm": 0.6640625, + "learning_rate": 9.554354838709678e-05, + "loss": 0.1729, + "step": 3264 + }, + { + "epoch": 0.05224, + "grad_norm": 0.94140625, + "learning_rate": 9.554193548387098e-05, + "loss": 0.1864, + "step": 3265 + }, + { + "epoch": 0.052256, + "grad_norm": 0.80859375, + "learning_rate": 9.554032258064518e-05, + "loss": 0.1615, + "step": 3266 + }, + { + "epoch": 0.052272, + "grad_norm": 0.94921875, + "learning_rate": 9.553870967741936e-05, + "loss": 0.1974, + "step": 3267 + }, + { + "epoch": 0.052288, + "grad_norm": 1.1875, + "learning_rate": 9.553709677419356e-05, + "loss": 0.1659, + "step": 3268 + }, + { + "epoch": 0.052304, + "grad_norm": 0.9453125, + "learning_rate": 9.553548387096775e-05, + "loss": 0.1723, + "step": 3269 + }, + { + "epoch": 0.05232, + "grad_norm": 0.97265625, + "learning_rate": 9.553387096774195e-05, + "loss": 0.1838, + "step": 3270 + }, + { + "epoch": 0.052336, + "grad_norm": 0.91796875, + "learning_rate": 9.553225806451613e-05, + "loss": 0.2007, + "step": 3271 + }, + { + "epoch": 0.052352, + "grad_norm": 1.25, + "learning_rate": 9.553064516129032e-05, + "loss": 0.1897, + "step": 3272 + }, + { + "epoch": 0.052368, + "grad_norm": 1.1875, + "learning_rate": 9.552903225806452e-05, + "loss": 0.229, + "step": 3273 + }, + { + "epoch": 0.052384, + "grad_norm": 1.2578125, + "learning_rate": 9.55274193548387e-05, + "loss": 0.1618, + "step": 3274 + }, + { + "epoch": 0.0524, + "grad_norm": 1.46875, + "learning_rate": 9.55258064516129e-05, + "loss": 0.2105, + "step": 3275 + }, + { + "epoch": 0.052416, + "grad_norm": 0.73828125, + "learning_rate": 9.55241935483871e-05, + "loss": 0.1521, + "step": 3276 + }, + { + "epoch": 0.052432, + "grad_norm": 1.0078125, + "learning_rate": 9.552258064516129e-05, + "loss": 0.1882, + "step": 3277 + }, + { + "epoch": 0.052448, + "grad_norm": 0.94921875, + "learning_rate": 9.552096774193549e-05, + "loss": 0.1673, + "step": 3278 + }, + { + "epoch": 0.052464, + "grad_norm": 1.28125, + "learning_rate": 9.551935483870969e-05, + "loss": 0.1808, + "step": 3279 + }, + { + "epoch": 0.05248, + "grad_norm": 0.9921875, + "learning_rate": 9.551774193548388e-05, + "loss": 0.229, + "step": 3280 + }, + { + "epoch": 0.052496, + "grad_norm": 1.59375, + "learning_rate": 9.551612903225808e-05, + "loss": 0.1723, + "step": 3281 + }, + { + "epoch": 0.052512, + "grad_norm": 1.21875, + "learning_rate": 9.551451612903226e-05, + "loss": 0.1965, + "step": 3282 + }, + { + "epoch": 0.052528, + "grad_norm": 0.94140625, + "learning_rate": 9.551290322580646e-05, + "loss": 0.1893, + "step": 3283 + }, + { + "epoch": 0.052544, + "grad_norm": 0.671875, + "learning_rate": 9.551129032258065e-05, + "loss": 0.187, + "step": 3284 + }, + { + "epoch": 0.05256, + "grad_norm": 1.1640625, + "learning_rate": 9.550967741935485e-05, + "loss": 0.1808, + "step": 3285 + }, + { + "epoch": 0.052576, + "grad_norm": 0.671875, + "learning_rate": 9.550806451612903e-05, + "loss": 0.1384, + "step": 3286 + }, + { + "epoch": 0.052592, + "grad_norm": 1.109375, + "learning_rate": 9.550645161290322e-05, + "loss": 0.1796, + "step": 3287 + }, + { + "epoch": 0.052608, + "grad_norm": 0.73828125, + "learning_rate": 9.550483870967742e-05, + "loss": 0.1982, + "step": 3288 + }, + { + "epoch": 0.052624, + "grad_norm": 0.765625, + "learning_rate": 9.550322580645162e-05, + "loss": 0.156, + "step": 3289 + }, + { + "epoch": 0.05264, + "grad_norm": 0.9453125, + "learning_rate": 9.550161290322582e-05, + "loss": 0.2036, + "step": 3290 + }, + { + "epoch": 0.052656, + "grad_norm": 0.625, + "learning_rate": 9.55e-05, + "loss": 0.1625, + "step": 3291 + }, + { + "epoch": 0.052672, + "grad_norm": 0.84765625, + "learning_rate": 9.54983870967742e-05, + "loss": 0.1983, + "step": 3292 + }, + { + "epoch": 0.052688, + "grad_norm": 1.0859375, + "learning_rate": 9.549677419354839e-05, + "loss": 0.1999, + "step": 3293 + }, + { + "epoch": 0.052704, + "grad_norm": 1.421875, + "learning_rate": 9.549516129032259e-05, + "loss": 0.1629, + "step": 3294 + }, + { + "epoch": 0.05272, + "grad_norm": 0.8203125, + "learning_rate": 9.549354838709678e-05, + "loss": 0.2266, + "step": 3295 + }, + { + "epoch": 0.052736, + "grad_norm": 0.9375, + "learning_rate": 9.549193548387097e-05, + "loss": 0.2082, + "step": 3296 + }, + { + "epoch": 0.052752, + "grad_norm": 0.62109375, + "learning_rate": 9.549032258064516e-05, + "loss": 0.1607, + "step": 3297 + }, + { + "epoch": 0.052768, + "grad_norm": 0.71484375, + "learning_rate": 9.548870967741936e-05, + "loss": 0.1648, + "step": 3298 + }, + { + "epoch": 0.052784, + "grad_norm": 0.7421875, + "learning_rate": 9.548709677419355e-05, + "loss": 0.1875, + "step": 3299 + }, + { + "epoch": 0.0528, + "grad_norm": 1.421875, + "learning_rate": 9.548548387096775e-05, + "loss": 0.1971, + "step": 3300 + }, + { + "epoch": 0.052816, + "grad_norm": 1.0078125, + "learning_rate": 9.548387096774195e-05, + "loss": 0.1733, + "step": 3301 + }, + { + "epoch": 0.052832, + "grad_norm": 0.58984375, + "learning_rate": 9.548225806451615e-05, + "loss": 0.1614, + "step": 3302 + }, + { + "epoch": 0.052848, + "grad_norm": 0.98828125, + "learning_rate": 9.548064516129033e-05, + "loss": 0.2337, + "step": 3303 + }, + { + "epoch": 0.052864, + "grad_norm": 1.0546875, + "learning_rate": 9.547903225806452e-05, + "loss": 0.2293, + "step": 3304 + }, + { + "epoch": 0.05288, + "grad_norm": 0.75390625, + "learning_rate": 9.547741935483872e-05, + "loss": 0.1831, + "step": 3305 + }, + { + "epoch": 0.052896, + "grad_norm": 0.765625, + "learning_rate": 9.54758064516129e-05, + "loss": 0.1849, + "step": 3306 + }, + { + "epoch": 0.052912, + "grad_norm": 0.8984375, + "learning_rate": 9.54741935483871e-05, + "loss": 0.1518, + "step": 3307 + }, + { + "epoch": 0.052928, + "grad_norm": 0.8515625, + "learning_rate": 9.547258064516129e-05, + "loss": 0.1585, + "step": 3308 + }, + { + "epoch": 0.052944, + "grad_norm": 0.9140625, + "learning_rate": 9.547096774193549e-05, + "loss": 0.1748, + "step": 3309 + }, + { + "epoch": 0.05296, + "grad_norm": 0.87109375, + "learning_rate": 9.546935483870967e-05, + "loss": 0.1578, + "step": 3310 + }, + { + "epoch": 0.052976, + "grad_norm": 0.875, + "learning_rate": 9.546774193548387e-05, + "loss": 0.1942, + "step": 3311 + }, + { + "epoch": 0.052992, + "grad_norm": 1.421875, + "learning_rate": 9.546612903225806e-05, + "loss": 0.1947, + "step": 3312 + }, + { + "epoch": 0.053008, + "grad_norm": 1.0, + "learning_rate": 9.546451612903226e-05, + "loss": 0.2015, + "step": 3313 + }, + { + "epoch": 0.053024, + "grad_norm": 0.63671875, + "learning_rate": 9.546290322580646e-05, + "loss": 0.1833, + "step": 3314 + }, + { + "epoch": 0.05304, + "grad_norm": 1.125, + "learning_rate": 9.546129032258066e-05, + "loss": 0.1739, + "step": 3315 + }, + { + "epoch": 0.053056, + "grad_norm": 0.76953125, + "learning_rate": 9.545967741935485e-05, + "loss": 0.1834, + "step": 3316 + }, + { + "epoch": 0.053072, + "grad_norm": 0.8828125, + "learning_rate": 9.545806451612905e-05, + "loss": 0.19, + "step": 3317 + }, + { + "epoch": 0.053088, + "grad_norm": 0.671875, + "learning_rate": 9.545645161290323e-05, + "loss": 0.147, + "step": 3318 + }, + { + "epoch": 0.053104, + "grad_norm": 0.88671875, + "learning_rate": 9.545483870967742e-05, + "loss": 0.2068, + "step": 3319 + }, + { + "epoch": 0.05312, + "grad_norm": 0.76171875, + "learning_rate": 9.545322580645162e-05, + "loss": 0.1797, + "step": 3320 + }, + { + "epoch": 0.053136, + "grad_norm": 0.92578125, + "learning_rate": 9.54516129032258e-05, + "loss": 0.2193, + "step": 3321 + }, + { + "epoch": 0.053152, + "grad_norm": 1.1328125, + "learning_rate": 9.545e-05, + "loss": 0.203, + "step": 3322 + }, + { + "epoch": 0.053168, + "grad_norm": 0.8984375, + "learning_rate": 9.544838709677419e-05, + "loss": 0.2072, + "step": 3323 + }, + { + "epoch": 0.053184, + "grad_norm": 0.66015625, + "learning_rate": 9.544677419354839e-05, + "loss": 0.1631, + "step": 3324 + }, + { + "epoch": 0.0532, + "grad_norm": 0.62890625, + "learning_rate": 9.544516129032259e-05, + "loss": 0.1608, + "step": 3325 + }, + { + "epoch": 0.053216, + "grad_norm": 0.7734375, + "learning_rate": 9.544354838709679e-05, + "loss": 0.2263, + "step": 3326 + }, + { + "epoch": 0.053232, + "grad_norm": 0.78515625, + "learning_rate": 9.544193548387097e-05, + "loss": 0.2338, + "step": 3327 + }, + { + "epoch": 0.053248, + "grad_norm": 0.92578125, + "learning_rate": 9.544032258064517e-05, + "loss": 0.2069, + "step": 3328 + }, + { + "epoch": 0.053264, + "grad_norm": 0.7421875, + "learning_rate": 9.543870967741936e-05, + "loss": 0.1566, + "step": 3329 + }, + { + "epoch": 0.05328, + "grad_norm": 0.76171875, + "learning_rate": 9.543709677419356e-05, + "loss": 0.2174, + "step": 3330 + }, + { + "epoch": 0.053296, + "grad_norm": 1.4765625, + "learning_rate": 9.543548387096775e-05, + "loss": 0.2009, + "step": 3331 + }, + { + "epoch": 0.053312, + "grad_norm": 0.86328125, + "learning_rate": 9.543387096774194e-05, + "loss": 0.1766, + "step": 3332 + }, + { + "epoch": 0.053328, + "grad_norm": 0.88671875, + "learning_rate": 9.543225806451613e-05, + "loss": 0.1841, + "step": 3333 + }, + { + "epoch": 0.053344, + "grad_norm": 0.953125, + "learning_rate": 9.543064516129032e-05, + "loss": 0.2023, + "step": 3334 + }, + { + "epoch": 0.05336, + "grad_norm": 0.95703125, + "learning_rate": 9.542903225806452e-05, + "loss": 0.2073, + "step": 3335 + }, + { + "epoch": 0.053376, + "grad_norm": 0.82421875, + "learning_rate": 9.542741935483872e-05, + "loss": 0.1539, + "step": 3336 + }, + { + "epoch": 0.053392, + "grad_norm": 0.57421875, + "learning_rate": 9.542580645161292e-05, + "loss": 0.1501, + "step": 3337 + }, + { + "epoch": 0.053408, + "grad_norm": 0.7265625, + "learning_rate": 9.54241935483871e-05, + "loss": 0.2013, + "step": 3338 + }, + { + "epoch": 0.053424, + "grad_norm": 0.515625, + "learning_rate": 9.54225806451613e-05, + "loss": 0.154, + "step": 3339 + }, + { + "epoch": 0.05344, + "grad_norm": 0.79296875, + "learning_rate": 9.542096774193549e-05, + "loss": 0.195, + "step": 3340 + }, + { + "epoch": 0.053456, + "grad_norm": 0.90234375, + "learning_rate": 9.541935483870969e-05, + "loss": 0.2172, + "step": 3341 + }, + { + "epoch": 0.053472, + "grad_norm": 1.09375, + "learning_rate": 9.541774193548387e-05, + "loss": 0.1873, + "step": 3342 + }, + { + "epoch": 0.053488, + "grad_norm": 0.85546875, + "learning_rate": 9.541612903225807e-05, + "loss": 0.1661, + "step": 3343 + }, + { + "epoch": 0.053504, + "grad_norm": 0.63671875, + "learning_rate": 9.541451612903226e-05, + "loss": 0.1483, + "step": 3344 + }, + { + "epoch": 0.05352, + "grad_norm": 0.96875, + "learning_rate": 9.541290322580646e-05, + "loss": 0.2019, + "step": 3345 + }, + { + "epoch": 0.053536, + "grad_norm": 0.7734375, + "learning_rate": 9.541129032258064e-05, + "loss": 0.2273, + "step": 3346 + }, + { + "epoch": 0.053552, + "grad_norm": 1.015625, + "learning_rate": 9.540967741935484e-05, + "loss": 0.229, + "step": 3347 + }, + { + "epoch": 0.053568, + "grad_norm": 0.66015625, + "learning_rate": 9.540806451612903e-05, + "loss": 0.1534, + "step": 3348 + }, + { + "epoch": 0.053584, + "grad_norm": 0.69140625, + "learning_rate": 9.540645161290323e-05, + "loss": 0.1739, + "step": 3349 + }, + { + "epoch": 0.0536, + "grad_norm": 0.61328125, + "learning_rate": 9.540483870967743e-05, + "loss": 0.1822, + "step": 3350 + }, + { + "epoch": 0.053616, + "grad_norm": 0.8359375, + "learning_rate": 9.540322580645162e-05, + "loss": 0.1781, + "step": 3351 + }, + { + "epoch": 0.053632, + "grad_norm": 0.5703125, + "learning_rate": 9.540161290322582e-05, + "loss": 0.1533, + "step": 3352 + }, + { + "epoch": 0.053648, + "grad_norm": 0.8046875, + "learning_rate": 9.54e-05, + "loss": 0.1869, + "step": 3353 + }, + { + "epoch": 0.053664, + "grad_norm": 0.77734375, + "learning_rate": 9.53983870967742e-05, + "loss": 0.2015, + "step": 3354 + }, + { + "epoch": 0.05368, + "grad_norm": 0.94140625, + "learning_rate": 9.539677419354839e-05, + "loss": 0.2141, + "step": 3355 + }, + { + "epoch": 0.053696, + "grad_norm": 1.0625, + "learning_rate": 9.539516129032259e-05, + "loss": 0.1892, + "step": 3356 + }, + { + "epoch": 0.053712, + "grad_norm": 0.82421875, + "learning_rate": 9.539354838709677e-05, + "loss": 0.1871, + "step": 3357 + }, + { + "epoch": 0.053728, + "grad_norm": 0.921875, + "learning_rate": 9.539193548387097e-05, + "loss": 0.1879, + "step": 3358 + }, + { + "epoch": 0.053744, + "grad_norm": 0.80078125, + "learning_rate": 9.539032258064516e-05, + "loss": 0.1743, + "step": 3359 + }, + { + "epoch": 0.05376, + "grad_norm": 1.15625, + "learning_rate": 9.538870967741936e-05, + "loss": 0.2037, + "step": 3360 + }, + { + "epoch": 0.053776, + "grad_norm": 0.78515625, + "learning_rate": 9.538709677419356e-05, + "loss": 0.1747, + "step": 3361 + }, + { + "epoch": 0.053792, + "grad_norm": 0.81640625, + "learning_rate": 9.538548387096776e-05, + "loss": 0.1533, + "step": 3362 + }, + { + "epoch": 0.053808, + "grad_norm": 1.03125, + "learning_rate": 9.538387096774194e-05, + "loss": 0.2209, + "step": 3363 + }, + { + "epoch": 0.053824, + "grad_norm": 0.78125, + "learning_rate": 9.538225806451614e-05, + "loss": 0.1671, + "step": 3364 + }, + { + "epoch": 0.05384, + "grad_norm": 0.98828125, + "learning_rate": 9.538064516129033e-05, + "loss": 0.1735, + "step": 3365 + }, + { + "epoch": 0.053856, + "grad_norm": 0.7734375, + "learning_rate": 9.537903225806452e-05, + "loss": 0.1736, + "step": 3366 + }, + { + "epoch": 0.053872, + "grad_norm": 0.859375, + "learning_rate": 9.537741935483871e-05, + "loss": 0.1574, + "step": 3367 + }, + { + "epoch": 0.053888, + "grad_norm": 0.859375, + "learning_rate": 9.53758064516129e-05, + "loss": 0.1757, + "step": 3368 + }, + { + "epoch": 0.053904, + "grad_norm": 0.71484375, + "learning_rate": 9.53741935483871e-05, + "loss": 0.1381, + "step": 3369 + }, + { + "epoch": 0.05392, + "grad_norm": 1.25, + "learning_rate": 9.537258064516129e-05, + "loss": 0.1652, + "step": 3370 + }, + { + "epoch": 0.053936, + "grad_norm": 1.3203125, + "learning_rate": 9.537096774193549e-05, + "loss": 0.1463, + "step": 3371 + }, + { + "epoch": 0.053952, + "grad_norm": 0.98046875, + "learning_rate": 9.536935483870967e-05, + "loss": 0.1621, + "step": 3372 + }, + { + "epoch": 0.053968, + "grad_norm": 0.9375, + "learning_rate": 9.536774193548387e-05, + "loss": 0.17, + "step": 3373 + }, + { + "epoch": 0.053984, + "grad_norm": 0.8515625, + "learning_rate": 9.536612903225807e-05, + "loss": 0.2068, + "step": 3374 + }, + { + "epoch": 0.054, + "grad_norm": 1.0234375, + "learning_rate": 9.536451612903227e-05, + "loss": 0.1958, + "step": 3375 + }, + { + "epoch": 0.054016, + "grad_norm": 0.88671875, + "learning_rate": 9.536290322580646e-05, + "loss": 0.1509, + "step": 3376 + }, + { + "epoch": 0.054032, + "grad_norm": 1.484375, + "learning_rate": 9.536129032258066e-05, + "loss": 0.2042, + "step": 3377 + }, + { + "epoch": 0.054048, + "grad_norm": 1.2421875, + "learning_rate": 9.535967741935484e-05, + "loss": 0.1901, + "step": 3378 + }, + { + "epoch": 0.054064, + "grad_norm": 0.94140625, + "learning_rate": 9.535806451612904e-05, + "loss": 0.1624, + "step": 3379 + }, + { + "epoch": 0.05408, + "grad_norm": 2.140625, + "learning_rate": 9.535645161290323e-05, + "loss": 0.2392, + "step": 3380 + }, + { + "epoch": 0.054096, + "grad_norm": 1.75, + "learning_rate": 9.535483870967741e-05, + "loss": 0.1942, + "step": 3381 + }, + { + "epoch": 0.054112, + "grad_norm": 1.1875, + "learning_rate": 9.535322580645161e-05, + "loss": 0.1665, + "step": 3382 + }, + { + "epoch": 0.054128, + "grad_norm": 1.109375, + "learning_rate": 9.53516129032258e-05, + "loss": 0.232, + "step": 3383 + }, + { + "epoch": 0.054144, + "grad_norm": 0.78515625, + "learning_rate": 9.535e-05, + "loss": 0.1804, + "step": 3384 + }, + { + "epoch": 0.05416, + "grad_norm": 0.94921875, + "learning_rate": 9.53483870967742e-05, + "loss": 0.1321, + "step": 3385 + }, + { + "epoch": 0.054176, + "grad_norm": 1.0625, + "learning_rate": 9.53467741935484e-05, + "loss": 0.167, + "step": 3386 + }, + { + "epoch": 0.054192, + "grad_norm": 0.859375, + "learning_rate": 9.534516129032259e-05, + "loss": 0.1849, + "step": 3387 + }, + { + "epoch": 0.054208, + "grad_norm": 1.1484375, + "learning_rate": 9.534354838709679e-05, + "loss": 0.1834, + "step": 3388 + }, + { + "epoch": 0.054224, + "grad_norm": 1.4453125, + "learning_rate": 9.534193548387097e-05, + "loss": 0.228, + "step": 3389 + }, + { + "epoch": 0.05424, + "grad_norm": 1.046875, + "learning_rate": 9.534032258064517e-05, + "loss": 0.1933, + "step": 3390 + }, + { + "epoch": 0.054256, + "grad_norm": 0.5859375, + "learning_rate": 9.533870967741936e-05, + "loss": 0.1486, + "step": 3391 + }, + { + "epoch": 0.054272, + "grad_norm": 2.390625, + "learning_rate": 9.533709677419356e-05, + "loss": 0.2146, + "step": 3392 + }, + { + "epoch": 0.054288, + "grad_norm": 1.2734375, + "learning_rate": 9.533548387096774e-05, + "loss": 0.2158, + "step": 3393 + }, + { + "epoch": 0.054304, + "grad_norm": 1.0859375, + "learning_rate": 9.533387096774194e-05, + "loss": 0.2133, + "step": 3394 + }, + { + "epoch": 0.05432, + "grad_norm": 0.90234375, + "learning_rate": 9.533225806451613e-05, + "loss": 0.1807, + "step": 3395 + }, + { + "epoch": 0.054336, + "grad_norm": 1.359375, + "learning_rate": 9.533064516129033e-05, + "loss": 0.2218, + "step": 3396 + }, + { + "epoch": 0.054352, + "grad_norm": 0.78125, + "learning_rate": 9.532903225806453e-05, + "loss": 0.171, + "step": 3397 + }, + { + "epoch": 0.054368, + "grad_norm": 0.59375, + "learning_rate": 9.532741935483871e-05, + "loss": 0.1916, + "step": 3398 + }, + { + "epoch": 0.054384, + "grad_norm": 0.8671875, + "learning_rate": 9.532580645161291e-05, + "loss": 0.1987, + "step": 3399 + }, + { + "epoch": 0.0544, + "grad_norm": 1.0234375, + "learning_rate": 9.53241935483871e-05, + "loss": 0.1918, + "step": 3400 + }, + { + "epoch": 0.054416, + "grad_norm": 0.84375, + "learning_rate": 9.53225806451613e-05, + "loss": 0.1936, + "step": 3401 + }, + { + "epoch": 0.054432, + "grad_norm": 0.984375, + "learning_rate": 9.532096774193549e-05, + "loss": 0.1591, + "step": 3402 + }, + { + "epoch": 0.054448, + "grad_norm": 0.98828125, + "learning_rate": 9.531935483870968e-05, + "loss": 0.185, + "step": 3403 + }, + { + "epoch": 0.054464, + "grad_norm": 0.6171875, + "learning_rate": 9.531774193548387e-05, + "loss": 0.163, + "step": 3404 + }, + { + "epoch": 0.05448, + "grad_norm": 1.0859375, + "learning_rate": 9.531612903225807e-05, + "loss": 0.1366, + "step": 3405 + }, + { + "epoch": 0.054496, + "grad_norm": 1.078125, + "learning_rate": 9.531451612903226e-05, + "loss": 0.2213, + "step": 3406 + }, + { + "epoch": 0.054512, + "grad_norm": 0.95703125, + "learning_rate": 9.531290322580646e-05, + "loss": 0.1651, + "step": 3407 + }, + { + "epoch": 0.054528, + "grad_norm": 0.9765625, + "learning_rate": 9.531129032258064e-05, + "loss": 0.1903, + "step": 3408 + }, + { + "epoch": 0.054544, + "grad_norm": 1.0390625, + "learning_rate": 9.530967741935484e-05, + "loss": 0.2283, + "step": 3409 + }, + { + "epoch": 0.05456, + "grad_norm": 0.77734375, + "learning_rate": 9.530806451612904e-05, + "loss": 0.1651, + "step": 3410 + }, + { + "epoch": 0.054576, + "grad_norm": 0.984375, + "learning_rate": 9.530645161290324e-05, + "loss": 0.1477, + "step": 3411 + }, + { + "epoch": 0.054592, + "grad_norm": 1.4609375, + "learning_rate": 9.530483870967743e-05, + "loss": 0.221, + "step": 3412 + }, + { + "epoch": 0.054608, + "grad_norm": 0.66015625, + "learning_rate": 9.530322580645161e-05, + "loss": 0.2001, + "step": 3413 + }, + { + "epoch": 0.054624, + "grad_norm": 1.0703125, + "learning_rate": 9.530161290322581e-05, + "loss": 0.2184, + "step": 3414 + }, + { + "epoch": 0.05464, + "grad_norm": 0.62890625, + "learning_rate": 9.53e-05, + "loss": 0.1347, + "step": 3415 + }, + { + "epoch": 0.054656, + "grad_norm": 0.66796875, + "learning_rate": 9.52983870967742e-05, + "loss": 0.1817, + "step": 3416 + }, + { + "epoch": 0.054672, + "grad_norm": 0.8984375, + "learning_rate": 9.529677419354838e-05, + "loss": 0.1966, + "step": 3417 + }, + { + "epoch": 0.054688, + "grad_norm": 1.0234375, + "learning_rate": 9.529516129032258e-05, + "loss": 0.2183, + "step": 3418 + }, + { + "epoch": 0.054704, + "grad_norm": 0.94921875, + "learning_rate": 9.529354838709677e-05, + "loss": 0.2098, + "step": 3419 + }, + { + "epoch": 0.05472, + "grad_norm": 0.84765625, + "learning_rate": 9.529193548387097e-05, + "loss": 0.1897, + "step": 3420 + }, + { + "epoch": 0.054736, + "grad_norm": 0.6484375, + "learning_rate": 9.529032258064517e-05, + "loss": 0.1493, + "step": 3421 + }, + { + "epoch": 0.054752, + "grad_norm": 0.9296875, + "learning_rate": 9.528870967741937e-05, + "loss": 0.171, + "step": 3422 + }, + { + "epoch": 0.054768, + "grad_norm": 0.87890625, + "learning_rate": 9.528709677419356e-05, + "loss": 0.2009, + "step": 3423 + }, + { + "epoch": 0.054784, + "grad_norm": 0.8359375, + "learning_rate": 9.528548387096775e-05, + "loss": 0.1823, + "step": 3424 + }, + { + "epoch": 0.0548, + "grad_norm": 1.1171875, + "learning_rate": 9.528387096774194e-05, + "loss": 0.1655, + "step": 3425 + }, + { + "epoch": 0.054816, + "grad_norm": 0.59375, + "learning_rate": 9.528225806451614e-05, + "loss": 0.164, + "step": 3426 + }, + { + "epoch": 0.054832, + "grad_norm": 0.9765625, + "learning_rate": 9.528064516129033e-05, + "loss": 0.2269, + "step": 3427 + }, + { + "epoch": 0.054848, + "grad_norm": 0.89453125, + "learning_rate": 9.527903225806451e-05, + "loss": 0.19, + "step": 3428 + }, + { + "epoch": 0.054864, + "grad_norm": 0.84375, + "learning_rate": 9.527741935483871e-05, + "loss": 0.1718, + "step": 3429 + }, + { + "epoch": 0.05488, + "grad_norm": 0.671875, + "learning_rate": 9.52758064516129e-05, + "loss": 0.1659, + "step": 3430 + }, + { + "epoch": 0.054896, + "grad_norm": 1.1484375, + "learning_rate": 9.52741935483871e-05, + "loss": 0.2216, + "step": 3431 + }, + { + "epoch": 0.054912, + "grad_norm": 1.4375, + "learning_rate": 9.52725806451613e-05, + "loss": 0.1868, + "step": 3432 + }, + { + "epoch": 0.054928, + "grad_norm": 0.81640625, + "learning_rate": 9.52709677419355e-05, + "loss": 0.1427, + "step": 3433 + }, + { + "epoch": 0.054944, + "grad_norm": 0.8828125, + "learning_rate": 9.526935483870968e-05, + "loss": 0.2076, + "step": 3434 + }, + { + "epoch": 0.05496, + "grad_norm": 0.6953125, + "learning_rate": 9.526774193548388e-05, + "loss": 0.1974, + "step": 3435 + }, + { + "epoch": 0.054976, + "grad_norm": 0.89453125, + "learning_rate": 9.526612903225807e-05, + "loss": 0.1889, + "step": 3436 + }, + { + "epoch": 0.054992, + "grad_norm": 0.52734375, + "learning_rate": 9.526451612903227e-05, + "loss": 0.1678, + "step": 3437 + }, + { + "epoch": 0.055008, + "grad_norm": 0.85546875, + "learning_rate": 9.526290322580645e-05, + "loss": 0.1595, + "step": 3438 + }, + { + "epoch": 0.055024, + "grad_norm": 0.93359375, + "learning_rate": 9.526129032258065e-05, + "loss": 0.1793, + "step": 3439 + }, + { + "epoch": 0.05504, + "grad_norm": 1.375, + "learning_rate": 9.525967741935484e-05, + "loss": 0.1703, + "step": 3440 + }, + { + "epoch": 0.055056, + "grad_norm": 0.69921875, + "learning_rate": 9.525806451612904e-05, + "loss": 0.1884, + "step": 3441 + }, + { + "epoch": 0.055072, + "grad_norm": 1.25, + "learning_rate": 9.525645161290323e-05, + "loss": 0.2299, + "step": 3442 + }, + { + "epoch": 0.055088, + "grad_norm": 0.79296875, + "learning_rate": 9.525483870967741e-05, + "loss": 0.1783, + "step": 3443 + }, + { + "epoch": 0.055104, + "grad_norm": 0.81640625, + "learning_rate": 9.525322580645161e-05, + "loss": 0.1564, + "step": 3444 + }, + { + "epoch": 0.05512, + "grad_norm": 0.79296875, + "learning_rate": 9.525161290322581e-05, + "loss": 0.1046, + "step": 3445 + }, + { + "epoch": 0.055136, + "grad_norm": 1.0625, + "learning_rate": 9.525000000000001e-05, + "loss": 0.181, + "step": 3446 + }, + { + "epoch": 0.055152, + "grad_norm": 0.8125, + "learning_rate": 9.52483870967742e-05, + "loss": 0.2248, + "step": 3447 + }, + { + "epoch": 0.055168, + "grad_norm": 0.96875, + "learning_rate": 9.52467741935484e-05, + "loss": 0.2325, + "step": 3448 + }, + { + "epoch": 0.055184, + "grad_norm": 1.0703125, + "learning_rate": 9.524516129032258e-05, + "loss": 0.1996, + "step": 3449 + }, + { + "epoch": 0.0552, + "grad_norm": 0.734375, + "learning_rate": 9.524354838709678e-05, + "loss": 0.1976, + "step": 3450 + }, + { + "epoch": 0.055216, + "grad_norm": 0.7734375, + "learning_rate": 9.524193548387097e-05, + "loss": 0.1911, + "step": 3451 + }, + { + "epoch": 0.055232, + "grad_norm": 1.046875, + "learning_rate": 9.524032258064517e-05, + "loss": 0.2269, + "step": 3452 + }, + { + "epoch": 0.055248, + "grad_norm": 0.82421875, + "learning_rate": 9.523870967741935e-05, + "loss": 0.2194, + "step": 3453 + }, + { + "epoch": 0.055264, + "grad_norm": 0.61328125, + "learning_rate": 9.523709677419355e-05, + "loss": 0.1434, + "step": 3454 + }, + { + "epoch": 0.05528, + "grad_norm": 0.9140625, + "learning_rate": 9.523548387096774e-05, + "loss": 0.1851, + "step": 3455 + }, + { + "epoch": 0.055296, + "grad_norm": 0.71484375, + "learning_rate": 9.523387096774194e-05, + "loss": 0.1785, + "step": 3456 + }, + { + "epoch": 0.055312, + "grad_norm": 0.71484375, + "learning_rate": 9.523225806451614e-05, + "loss": 0.1916, + "step": 3457 + }, + { + "epoch": 0.055328, + "grad_norm": 0.85546875, + "learning_rate": 9.523064516129033e-05, + "loss": 0.1828, + "step": 3458 + }, + { + "epoch": 0.055344, + "grad_norm": 0.70703125, + "learning_rate": 9.522903225806453e-05, + "loss": 0.1574, + "step": 3459 + }, + { + "epoch": 0.05536, + "grad_norm": 1.2890625, + "learning_rate": 9.522741935483871e-05, + "loss": 0.199, + "step": 3460 + }, + { + "epoch": 0.055376, + "grad_norm": 0.6171875, + "learning_rate": 9.522580645161291e-05, + "loss": 0.1418, + "step": 3461 + }, + { + "epoch": 0.055392, + "grad_norm": 0.88671875, + "learning_rate": 9.52241935483871e-05, + "loss": 0.1935, + "step": 3462 + }, + { + "epoch": 0.055408, + "grad_norm": 1.1015625, + "learning_rate": 9.52225806451613e-05, + "loss": 0.2657, + "step": 3463 + }, + { + "epoch": 0.055424, + "grad_norm": 0.90234375, + "learning_rate": 9.522096774193548e-05, + "loss": 0.1727, + "step": 3464 + }, + { + "epoch": 0.05544, + "grad_norm": 0.91796875, + "learning_rate": 9.521935483870968e-05, + "loss": 0.2032, + "step": 3465 + }, + { + "epoch": 0.055456, + "grad_norm": 0.67578125, + "learning_rate": 9.521774193548387e-05, + "loss": 0.1481, + "step": 3466 + }, + { + "epoch": 0.055472, + "grad_norm": 1.5390625, + "learning_rate": 9.521612903225807e-05, + "loss": 0.1857, + "step": 3467 + }, + { + "epoch": 0.055488, + "grad_norm": 1.671875, + "learning_rate": 9.521451612903225e-05, + "loss": 0.1518, + "step": 3468 + }, + { + "epoch": 0.055504, + "grad_norm": 1.578125, + "learning_rate": 9.521290322580645e-05, + "loss": 0.195, + "step": 3469 + }, + { + "epoch": 0.05552, + "grad_norm": 1.0, + "learning_rate": 9.521129032258065e-05, + "loss": 0.1584, + "step": 3470 + }, + { + "epoch": 0.055536, + "grad_norm": 0.76171875, + "learning_rate": 9.520967741935485e-05, + "loss": 0.1513, + "step": 3471 + }, + { + "epoch": 0.055552, + "grad_norm": 0.8125, + "learning_rate": 9.520806451612904e-05, + "loss": 0.184, + "step": 3472 + }, + { + "epoch": 0.055568, + "grad_norm": 0.80078125, + "learning_rate": 9.520645161290324e-05, + "loss": 0.1692, + "step": 3473 + }, + { + "epoch": 0.055584, + "grad_norm": 0.59765625, + "learning_rate": 9.520483870967742e-05, + "loss": 0.1668, + "step": 3474 + }, + { + "epoch": 0.0556, + "grad_norm": 0.83203125, + "learning_rate": 9.520322580645161e-05, + "loss": 0.1626, + "step": 3475 + }, + { + "epoch": 0.055616, + "grad_norm": 0.73828125, + "learning_rate": 9.520161290322581e-05, + "loss": 0.1899, + "step": 3476 + }, + { + "epoch": 0.055632, + "grad_norm": 0.69140625, + "learning_rate": 9.52e-05, + "loss": 0.1901, + "step": 3477 + }, + { + "epoch": 0.055648, + "grad_norm": 1.3515625, + "learning_rate": 9.51983870967742e-05, + "loss": 0.1637, + "step": 3478 + }, + { + "epoch": 0.055664, + "grad_norm": 0.77734375, + "learning_rate": 9.519677419354838e-05, + "loss": 0.1661, + "step": 3479 + }, + { + "epoch": 0.05568, + "grad_norm": 1.0859375, + "learning_rate": 9.519516129032258e-05, + "loss": 0.2348, + "step": 3480 + }, + { + "epoch": 0.055696, + "grad_norm": 0.6171875, + "learning_rate": 9.519354838709678e-05, + "loss": 0.1416, + "step": 3481 + }, + { + "epoch": 0.055712, + "grad_norm": 1.140625, + "learning_rate": 9.519193548387098e-05, + "loss": 0.1993, + "step": 3482 + }, + { + "epoch": 0.055728, + "grad_norm": 1.296875, + "learning_rate": 9.519032258064517e-05, + "loss": 0.22, + "step": 3483 + }, + { + "epoch": 0.055744, + "grad_norm": 0.72265625, + "learning_rate": 9.518870967741937e-05, + "loss": 0.1433, + "step": 3484 + }, + { + "epoch": 0.05576, + "grad_norm": 0.6796875, + "learning_rate": 9.518709677419355e-05, + "loss": 0.1488, + "step": 3485 + }, + { + "epoch": 0.055776, + "grad_norm": 0.6640625, + "learning_rate": 9.518548387096775e-05, + "loss": 0.1799, + "step": 3486 + }, + { + "epoch": 0.055792, + "grad_norm": 0.90234375, + "learning_rate": 9.518387096774194e-05, + "loss": 0.187, + "step": 3487 + }, + { + "epoch": 0.055808, + "grad_norm": 1.265625, + "learning_rate": 9.518225806451614e-05, + "loss": 0.1775, + "step": 3488 + }, + { + "epoch": 0.055824, + "grad_norm": 0.78515625, + "learning_rate": 9.518064516129032e-05, + "loss": 0.1592, + "step": 3489 + }, + { + "epoch": 0.05584, + "grad_norm": 1.109375, + "learning_rate": 9.517903225806451e-05, + "loss": 0.1901, + "step": 3490 + }, + { + "epoch": 0.055856, + "grad_norm": 0.96875, + "learning_rate": 9.517741935483871e-05, + "loss": 0.21, + "step": 3491 + }, + { + "epoch": 0.055872, + "grad_norm": 1.421875, + "learning_rate": 9.517580645161291e-05, + "loss": 0.2126, + "step": 3492 + }, + { + "epoch": 0.055888, + "grad_norm": 0.8828125, + "learning_rate": 9.517419354838711e-05, + "loss": 0.2192, + "step": 3493 + }, + { + "epoch": 0.055904, + "grad_norm": 0.6796875, + "learning_rate": 9.51725806451613e-05, + "loss": 0.1388, + "step": 3494 + }, + { + "epoch": 0.05592, + "grad_norm": 0.94921875, + "learning_rate": 9.51709677419355e-05, + "loss": 0.1968, + "step": 3495 + }, + { + "epoch": 0.055936, + "grad_norm": 0.7734375, + "learning_rate": 9.516935483870968e-05, + "loss": 0.2005, + "step": 3496 + }, + { + "epoch": 0.055952, + "grad_norm": 0.91015625, + "learning_rate": 9.516774193548388e-05, + "loss": 0.1805, + "step": 3497 + }, + { + "epoch": 0.055968, + "grad_norm": 0.82421875, + "learning_rate": 9.516612903225807e-05, + "loss": 0.176, + "step": 3498 + }, + { + "epoch": 0.055984, + "grad_norm": 0.8125, + "learning_rate": 9.516451612903227e-05, + "loss": 0.1881, + "step": 3499 + }, + { + "epoch": 0.056, + "grad_norm": 0.8203125, + "learning_rate": 9.516290322580645e-05, + "loss": 0.1803, + "step": 3500 + }, + { + "epoch": 0.056016, + "grad_norm": 1.3125, + "learning_rate": 9.516129032258065e-05, + "loss": 0.1645, + "step": 3501 + }, + { + "epoch": 0.056032, + "grad_norm": 0.828125, + "learning_rate": 9.515967741935484e-05, + "loss": 0.1569, + "step": 3502 + }, + { + "epoch": 0.056048, + "grad_norm": 0.84375, + "learning_rate": 9.515806451612904e-05, + "loss": 0.1797, + "step": 3503 + }, + { + "epoch": 0.056064, + "grad_norm": 1.328125, + "learning_rate": 9.515645161290322e-05, + "loss": 0.1208, + "step": 3504 + }, + { + "epoch": 0.05608, + "grad_norm": 0.73046875, + "learning_rate": 9.515483870967742e-05, + "loss": 0.1707, + "step": 3505 + }, + { + "epoch": 0.056096, + "grad_norm": 0.890625, + "learning_rate": 9.515322580645162e-05, + "loss": 0.2071, + "step": 3506 + }, + { + "epoch": 0.056112, + "grad_norm": 0.71484375, + "learning_rate": 9.515161290322581e-05, + "loss": 0.1824, + "step": 3507 + }, + { + "epoch": 0.056128, + "grad_norm": 1.0078125, + "learning_rate": 9.515000000000001e-05, + "loss": 0.2152, + "step": 3508 + }, + { + "epoch": 0.056144, + "grad_norm": 0.9375, + "learning_rate": 9.51483870967742e-05, + "loss": 0.1666, + "step": 3509 + }, + { + "epoch": 0.05616, + "grad_norm": 0.65234375, + "learning_rate": 9.51467741935484e-05, + "loss": 0.1602, + "step": 3510 + }, + { + "epoch": 0.056176, + "grad_norm": 1.0234375, + "learning_rate": 9.514516129032258e-05, + "loss": 0.2069, + "step": 3511 + }, + { + "epoch": 0.056192, + "grad_norm": 1.125, + "learning_rate": 9.514354838709678e-05, + "loss": 0.1859, + "step": 3512 + }, + { + "epoch": 0.056208, + "grad_norm": 1.28125, + "learning_rate": 9.514193548387097e-05, + "loss": 0.1594, + "step": 3513 + }, + { + "epoch": 0.056224, + "grad_norm": 0.6640625, + "learning_rate": 9.514032258064517e-05, + "loss": 0.1528, + "step": 3514 + }, + { + "epoch": 0.05624, + "grad_norm": 1.0703125, + "learning_rate": 9.513870967741935e-05, + "loss": 0.1801, + "step": 3515 + }, + { + "epoch": 0.056256, + "grad_norm": 0.703125, + "learning_rate": 9.513709677419355e-05, + "loss": 0.177, + "step": 3516 + }, + { + "epoch": 0.056272, + "grad_norm": 0.82421875, + "learning_rate": 9.513548387096775e-05, + "loss": 0.1582, + "step": 3517 + }, + { + "epoch": 0.056288, + "grad_norm": 0.66015625, + "learning_rate": 9.513387096774195e-05, + "loss": 0.1565, + "step": 3518 + }, + { + "epoch": 0.056304, + "grad_norm": 0.6953125, + "learning_rate": 9.513225806451614e-05, + "loss": 0.1871, + "step": 3519 + }, + { + "epoch": 0.05632, + "grad_norm": 0.63671875, + "learning_rate": 9.513064516129034e-05, + "loss": 0.1546, + "step": 3520 + }, + { + "epoch": 0.056336, + "grad_norm": 0.7734375, + "learning_rate": 9.512903225806452e-05, + "loss": 0.19, + "step": 3521 + }, + { + "epoch": 0.056352, + "grad_norm": 0.6953125, + "learning_rate": 9.512741935483871e-05, + "loss": 0.1935, + "step": 3522 + }, + { + "epoch": 0.056368, + "grad_norm": 1.1484375, + "learning_rate": 9.512580645161291e-05, + "loss": 0.1618, + "step": 3523 + }, + { + "epoch": 0.056384, + "grad_norm": 1.0078125, + "learning_rate": 9.51241935483871e-05, + "loss": 0.1839, + "step": 3524 + }, + { + "epoch": 0.0564, + "grad_norm": 1.0078125, + "learning_rate": 9.51225806451613e-05, + "loss": 0.2019, + "step": 3525 + }, + { + "epoch": 0.056416, + "grad_norm": 0.81640625, + "learning_rate": 9.512096774193548e-05, + "loss": 0.2208, + "step": 3526 + }, + { + "epoch": 0.056432, + "grad_norm": 0.734375, + "learning_rate": 9.511935483870968e-05, + "loss": 0.1731, + "step": 3527 + }, + { + "epoch": 0.056448, + "grad_norm": 0.7734375, + "learning_rate": 9.511774193548388e-05, + "loss": 0.1995, + "step": 3528 + }, + { + "epoch": 0.056464, + "grad_norm": 1.078125, + "learning_rate": 9.511612903225807e-05, + "loss": 0.2396, + "step": 3529 + }, + { + "epoch": 0.05648, + "grad_norm": 0.95703125, + "learning_rate": 9.511451612903227e-05, + "loss": 0.1854, + "step": 3530 + }, + { + "epoch": 0.056496, + "grad_norm": 1.125, + "learning_rate": 9.511290322580646e-05, + "loss": 0.1754, + "step": 3531 + }, + { + "epoch": 0.056512, + "grad_norm": 0.88671875, + "learning_rate": 9.511129032258065e-05, + "loss": 0.1968, + "step": 3532 + }, + { + "epoch": 0.056528, + "grad_norm": 1.4375, + "learning_rate": 9.510967741935485e-05, + "loss": 0.1713, + "step": 3533 + }, + { + "epoch": 0.056544, + "grad_norm": 0.73046875, + "learning_rate": 9.510806451612904e-05, + "loss": 0.2232, + "step": 3534 + }, + { + "epoch": 0.05656, + "grad_norm": 1.328125, + "learning_rate": 9.510645161290324e-05, + "loss": 0.2106, + "step": 3535 + }, + { + "epoch": 0.056576, + "grad_norm": 0.88671875, + "learning_rate": 9.510483870967742e-05, + "loss": 0.2126, + "step": 3536 + }, + { + "epoch": 0.056592, + "grad_norm": 0.83984375, + "learning_rate": 9.510322580645161e-05, + "loss": 0.1676, + "step": 3537 + }, + { + "epoch": 0.056608, + "grad_norm": 0.78125, + "learning_rate": 9.510161290322581e-05, + "loss": 0.2042, + "step": 3538 + }, + { + "epoch": 0.056624, + "grad_norm": 1.1015625, + "learning_rate": 9.51e-05, + "loss": 0.2299, + "step": 3539 + }, + { + "epoch": 0.05664, + "grad_norm": 0.51953125, + "learning_rate": 9.50983870967742e-05, + "loss": 0.1561, + "step": 3540 + }, + { + "epoch": 0.056656, + "grad_norm": 0.8125, + "learning_rate": 9.50967741935484e-05, + "loss": 0.2502, + "step": 3541 + }, + { + "epoch": 0.056672, + "grad_norm": 1.125, + "learning_rate": 9.509516129032259e-05, + "loss": 0.1719, + "step": 3542 + }, + { + "epoch": 0.056688, + "grad_norm": 1.1171875, + "learning_rate": 9.509354838709678e-05, + "loss": 0.171, + "step": 3543 + }, + { + "epoch": 0.056704, + "grad_norm": 0.79296875, + "learning_rate": 9.509193548387098e-05, + "loss": 0.1867, + "step": 3544 + }, + { + "epoch": 0.05672, + "grad_norm": 0.63671875, + "learning_rate": 9.509032258064516e-05, + "loss": 0.1752, + "step": 3545 + }, + { + "epoch": 0.056736, + "grad_norm": 0.80078125, + "learning_rate": 9.508870967741936e-05, + "loss": 0.1528, + "step": 3546 + }, + { + "epoch": 0.056752, + "grad_norm": 0.796875, + "learning_rate": 9.508709677419355e-05, + "loss": 0.2046, + "step": 3547 + }, + { + "epoch": 0.056768, + "grad_norm": 0.84765625, + "learning_rate": 9.508548387096775e-05, + "loss": 0.2068, + "step": 3548 + }, + { + "epoch": 0.056784, + "grad_norm": 0.72265625, + "learning_rate": 9.508387096774194e-05, + "loss": 0.1903, + "step": 3549 + }, + { + "epoch": 0.0568, + "grad_norm": 0.83203125, + "learning_rate": 9.508225806451614e-05, + "loss": 0.1918, + "step": 3550 + }, + { + "epoch": 0.056816, + "grad_norm": 0.83984375, + "learning_rate": 9.508064516129032e-05, + "loss": 0.181, + "step": 3551 + }, + { + "epoch": 0.056832, + "grad_norm": 1.0703125, + "learning_rate": 9.507903225806452e-05, + "loss": 0.2145, + "step": 3552 + }, + { + "epoch": 0.056848, + "grad_norm": 0.80078125, + "learning_rate": 9.507741935483872e-05, + "loss": 0.1962, + "step": 3553 + }, + { + "epoch": 0.056864, + "grad_norm": 1.0390625, + "learning_rate": 9.507580645161291e-05, + "loss": 0.2129, + "step": 3554 + }, + { + "epoch": 0.05688, + "grad_norm": 0.8203125, + "learning_rate": 9.507419354838711e-05, + "loss": 0.1867, + "step": 3555 + }, + { + "epoch": 0.056896, + "grad_norm": 1.3671875, + "learning_rate": 9.507258064516129e-05, + "loss": 0.2273, + "step": 3556 + }, + { + "epoch": 0.056912, + "grad_norm": 0.890625, + "learning_rate": 9.507096774193549e-05, + "loss": 0.1646, + "step": 3557 + }, + { + "epoch": 0.056928, + "grad_norm": 1.2421875, + "learning_rate": 9.506935483870968e-05, + "loss": 0.2093, + "step": 3558 + }, + { + "epoch": 0.056944, + "grad_norm": 0.76171875, + "learning_rate": 9.506774193548388e-05, + "loss": 0.184, + "step": 3559 + }, + { + "epoch": 0.05696, + "grad_norm": 0.7109375, + "learning_rate": 9.506612903225806e-05, + "loss": 0.2024, + "step": 3560 + }, + { + "epoch": 0.056976, + "grad_norm": 1.3203125, + "learning_rate": 9.506451612903226e-05, + "loss": 0.1647, + "step": 3561 + }, + { + "epoch": 0.056992, + "grad_norm": 0.80078125, + "learning_rate": 9.506290322580645e-05, + "loss": 0.1939, + "step": 3562 + }, + { + "epoch": 0.057008, + "grad_norm": 0.87109375, + "learning_rate": 9.506129032258065e-05, + "loss": 0.1643, + "step": 3563 + }, + { + "epoch": 0.057024, + "grad_norm": 1.2109375, + "learning_rate": 9.505967741935484e-05, + "loss": 0.226, + "step": 3564 + }, + { + "epoch": 0.05704, + "grad_norm": 0.8046875, + "learning_rate": 9.505806451612904e-05, + "loss": 0.1635, + "step": 3565 + }, + { + "epoch": 0.057056, + "grad_norm": 0.87109375, + "learning_rate": 9.505645161290324e-05, + "loss": 0.2098, + "step": 3566 + }, + { + "epoch": 0.057072, + "grad_norm": 0.875, + "learning_rate": 9.505483870967742e-05, + "loss": 0.1887, + "step": 3567 + }, + { + "epoch": 0.057088, + "grad_norm": 0.6171875, + "learning_rate": 9.505322580645162e-05, + "loss": 0.1692, + "step": 3568 + }, + { + "epoch": 0.057104, + "grad_norm": 0.7265625, + "learning_rate": 9.505161290322581e-05, + "loss": 0.1906, + "step": 3569 + }, + { + "epoch": 0.05712, + "grad_norm": 0.96484375, + "learning_rate": 9.505e-05, + "loss": 0.188, + "step": 3570 + }, + { + "epoch": 0.057136, + "grad_norm": 0.703125, + "learning_rate": 9.504838709677419e-05, + "loss": 0.1563, + "step": 3571 + }, + { + "epoch": 0.057152, + "grad_norm": 1.1484375, + "learning_rate": 9.504677419354839e-05, + "loss": 0.177, + "step": 3572 + }, + { + "epoch": 0.057168, + "grad_norm": 0.8515625, + "learning_rate": 9.504516129032258e-05, + "loss": 0.1226, + "step": 3573 + }, + { + "epoch": 0.057184, + "grad_norm": 1.59375, + "learning_rate": 9.504354838709678e-05, + "loss": 0.1931, + "step": 3574 + }, + { + "epoch": 0.0572, + "grad_norm": 0.89453125, + "learning_rate": 9.504193548387096e-05, + "loss": 0.1907, + "step": 3575 + }, + { + "epoch": 0.057216, + "grad_norm": 0.703125, + "learning_rate": 9.504032258064516e-05, + "loss": 0.1549, + "step": 3576 + }, + { + "epoch": 0.057232, + "grad_norm": 0.76171875, + "learning_rate": 9.503870967741936e-05, + "loss": 0.1708, + "step": 3577 + }, + { + "epoch": 0.057248, + "grad_norm": 1.140625, + "learning_rate": 9.503709677419356e-05, + "loss": 0.2089, + "step": 3578 + }, + { + "epoch": 0.057264, + "grad_norm": 0.890625, + "learning_rate": 9.503548387096775e-05, + "loss": 0.1692, + "step": 3579 + }, + { + "epoch": 0.05728, + "grad_norm": 0.78515625, + "learning_rate": 9.503387096774195e-05, + "loss": 0.2087, + "step": 3580 + }, + { + "epoch": 0.057296, + "grad_norm": 0.75390625, + "learning_rate": 9.503225806451613e-05, + "loss": 0.2011, + "step": 3581 + }, + { + "epoch": 0.057312, + "grad_norm": 1.7578125, + "learning_rate": 9.503064516129033e-05, + "loss": 0.1608, + "step": 3582 + }, + { + "epoch": 0.057328, + "grad_norm": 1.28125, + "learning_rate": 9.502903225806452e-05, + "loss": 0.1885, + "step": 3583 + }, + { + "epoch": 0.057344, + "grad_norm": 1.109375, + "learning_rate": 9.50274193548387e-05, + "loss": 0.1511, + "step": 3584 + }, + { + "epoch": 0.05736, + "grad_norm": 1.1171875, + "learning_rate": 9.50258064516129e-05, + "loss": 0.197, + "step": 3585 + }, + { + "epoch": 0.057376, + "grad_norm": 0.6953125, + "learning_rate": 9.502419354838709e-05, + "loss": 0.1897, + "step": 3586 + }, + { + "epoch": 0.057392, + "grad_norm": 0.6953125, + "learning_rate": 9.502258064516129e-05, + "loss": 0.1853, + "step": 3587 + }, + { + "epoch": 0.057408, + "grad_norm": 0.83203125, + "learning_rate": 9.502096774193549e-05, + "loss": 0.1765, + "step": 3588 + }, + { + "epoch": 0.057424, + "grad_norm": 1.1640625, + "learning_rate": 9.501935483870969e-05, + "loss": 0.1975, + "step": 3589 + }, + { + "epoch": 0.05744, + "grad_norm": 0.98828125, + "learning_rate": 9.501774193548388e-05, + "loss": 0.1851, + "step": 3590 + }, + { + "epoch": 0.057456, + "grad_norm": 0.73828125, + "learning_rate": 9.501612903225808e-05, + "loss": 0.1936, + "step": 3591 + }, + { + "epoch": 0.057472, + "grad_norm": 0.921875, + "learning_rate": 9.501451612903226e-05, + "loss": 0.2256, + "step": 3592 + }, + { + "epoch": 0.057488, + "grad_norm": 0.81640625, + "learning_rate": 9.501290322580646e-05, + "loss": 0.1593, + "step": 3593 + }, + { + "epoch": 0.057504, + "grad_norm": 0.90625, + "learning_rate": 9.501129032258065e-05, + "loss": 0.1948, + "step": 3594 + }, + { + "epoch": 0.05752, + "grad_norm": 0.796875, + "learning_rate": 9.500967741935485e-05, + "loss": 0.1936, + "step": 3595 + }, + { + "epoch": 0.057536, + "grad_norm": 0.87109375, + "learning_rate": 9.500806451612903e-05, + "loss": 0.174, + "step": 3596 + }, + { + "epoch": 0.057552, + "grad_norm": 0.86328125, + "learning_rate": 9.500645161290323e-05, + "loss": 0.1905, + "step": 3597 + }, + { + "epoch": 0.057568, + "grad_norm": 1.6875, + "learning_rate": 9.500483870967742e-05, + "loss": 0.1767, + "step": 3598 + }, + { + "epoch": 0.057584, + "grad_norm": 2.6875, + "learning_rate": 9.50032258064516e-05, + "loss": 0.1654, + "step": 3599 + }, + { + "epoch": 0.0576, + "grad_norm": 0.609375, + "learning_rate": 9.50016129032258e-05, + "loss": 0.1684, + "step": 3600 + }, + { + "epoch": 0.057616, + "grad_norm": 0.83203125, + "learning_rate": 9.5e-05, + "loss": 0.1965, + "step": 3601 + }, + { + "epoch": 0.057632, + "grad_norm": 0.8359375, + "learning_rate": 9.49983870967742e-05, + "loss": 0.1899, + "step": 3602 + }, + { + "epoch": 0.057648, + "grad_norm": 1.2265625, + "learning_rate": 9.499677419354839e-05, + "loss": 0.1725, + "step": 3603 + }, + { + "epoch": 0.057664, + "grad_norm": 0.8046875, + "learning_rate": 9.499516129032259e-05, + "loss": 0.2046, + "step": 3604 + }, + { + "epoch": 0.05768, + "grad_norm": 1.1484375, + "learning_rate": 9.499354838709678e-05, + "loss": 0.1902, + "step": 3605 + }, + { + "epoch": 0.057696, + "grad_norm": 0.89453125, + "learning_rate": 9.499193548387098e-05, + "loss": 0.2385, + "step": 3606 + }, + { + "epoch": 0.057712, + "grad_norm": 0.796875, + "learning_rate": 9.499032258064516e-05, + "loss": 0.1903, + "step": 3607 + }, + { + "epoch": 0.057728, + "grad_norm": 0.71875, + "learning_rate": 9.498870967741936e-05, + "loss": 0.1681, + "step": 3608 + }, + { + "epoch": 0.057744, + "grad_norm": 1.0078125, + "learning_rate": 9.498709677419355e-05, + "loss": 0.2332, + "step": 3609 + }, + { + "epoch": 0.05776, + "grad_norm": 0.7265625, + "learning_rate": 9.498548387096775e-05, + "loss": 0.1879, + "step": 3610 + }, + { + "epoch": 0.057776, + "grad_norm": 0.65625, + "learning_rate": 9.498387096774193e-05, + "loss": 0.1789, + "step": 3611 + }, + { + "epoch": 0.057792, + "grad_norm": 0.7890625, + "learning_rate": 9.498225806451613e-05, + "loss": 0.1764, + "step": 3612 + }, + { + "epoch": 0.057808, + "grad_norm": 0.90234375, + "learning_rate": 9.498064516129033e-05, + "loss": 0.2083, + "step": 3613 + }, + { + "epoch": 0.057824, + "grad_norm": 0.875, + "learning_rate": 9.497903225806452e-05, + "loss": 0.2164, + "step": 3614 + }, + { + "epoch": 0.05784, + "grad_norm": 1.15625, + "learning_rate": 9.497741935483872e-05, + "loss": 0.2491, + "step": 3615 + }, + { + "epoch": 0.057856, + "grad_norm": 0.97265625, + "learning_rate": 9.49758064516129e-05, + "loss": 0.1916, + "step": 3616 + }, + { + "epoch": 0.057872, + "grad_norm": 0.8828125, + "learning_rate": 9.49741935483871e-05, + "loss": 0.1492, + "step": 3617 + }, + { + "epoch": 0.057888, + "grad_norm": 1.0625, + "learning_rate": 9.497258064516129e-05, + "loss": 0.209, + "step": 3618 + }, + { + "epoch": 0.057904, + "grad_norm": 0.8125, + "learning_rate": 9.497096774193549e-05, + "loss": 0.1569, + "step": 3619 + }, + { + "epoch": 0.05792, + "grad_norm": 0.66796875, + "learning_rate": 9.496935483870968e-05, + "loss": 0.1817, + "step": 3620 + }, + { + "epoch": 0.057936, + "grad_norm": 1.3203125, + "learning_rate": 9.496774193548388e-05, + "loss": 0.2314, + "step": 3621 + }, + { + "epoch": 0.057952, + "grad_norm": 0.8828125, + "learning_rate": 9.496612903225806e-05, + "loss": 0.1744, + "step": 3622 + }, + { + "epoch": 0.057968, + "grad_norm": 0.9296875, + "learning_rate": 9.496451612903226e-05, + "loss": 0.1763, + "step": 3623 + }, + { + "epoch": 0.057984, + "grad_norm": 0.78125, + "learning_rate": 9.496290322580645e-05, + "loss": 0.1855, + "step": 3624 + }, + { + "epoch": 0.058, + "grad_norm": 1.0078125, + "learning_rate": 9.496129032258065e-05, + "loss": 0.2302, + "step": 3625 + }, + { + "epoch": 0.058016, + "grad_norm": 0.80859375, + "learning_rate": 9.495967741935485e-05, + "loss": 0.1953, + "step": 3626 + }, + { + "epoch": 0.058032, + "grad_norm": 1.0703125, + "learning_rate": 9.495806451612905e-05, + "loss": 0.197, + "step": 3627 + }, + { + "epoch": 0.058048, + "grad_norm": 0.87890625, + "learning_rate": 9.495645161290323e-05, + "loss": 0.1793, + "step": 3628 + }, + { + "epoch": 0.058064, + "grad_norm": 0.859375, + "learning_rate": 9.495483870967743e-05, + "loss": 0.1708, + "step": 3629 + }, + { + "epoch": 0.05808, + "grad_norm": 1.296875, + "learning_rate": 9.495322580645162e-05, + "loss": 0.2104, + "step": 3630 + }, + { + "epoch": 0.058096, + "grad_norm": 0.84765625, + "learning_rate": 9.49516129032258e-05, + "loss": 0.225, + "step": 3631 + }, + { + "epoch": 0.058112, + "grad_norm": 0.80078125, + "learning_rate": 9.495e-05, + "loss": 0.139, + "step": 3632 + }, + { + "epoch": 0.058128, + "grad_norm": 0.89453125, + "learning_rate": 9.494838709677419e-05, + "loss": 0.1865, + "step": 3633 + }, + { + "epoch": 0.058144, + "grad_norm": 0.7109375, + "learning_rate": 9.494677419354839e-05, + "loss": 0.1689, + "step": 3634 + }, + { + "epoch": 0.05816, + "grad_norm": 0.70703125, + "learning_rate": 9.494516129032258e-05, + "loss": 0.1662, + "step": 3635 + }, + { + "epoch": 0.058176, + "grad_norm": 0.83984375, + "learning_rate": 9.494354838709678e-05, + "loss": 0.1735, + "step": 3636 + }, + { + "epoch": 0.058192, + "grad_norm": 0.984375, + "learning_rate": 9.494193548387098e-05, + "loss": 0.1698, + "step": 3637 + }, + { + "epoch": 0.058208, + "grad_norm": 0.796875, + "learning_rate": 9.494032258064517e-05, + "loss": 0.159, + "step": 3638 + }, + { + "epoch": 0.058224, + "grad_norm": 0.78125, + "learning_rate": 9.493870967741936e-05, + "loss": 0.1767, + "step": 3639 + }, + { + "epoch": 0.05824, + "grad_norm": 1.0, + "learning_rate": 9.493709677419356e-05, + "loss": 0.1685, + "step": 3640 + }, + { + "epoch": 0.058256, + "grad_norm": 0.80078125, + "learning_rate": 9.493548387096775e-05, + "loss": 0.2093, + "step": 3641 + }, + { + "epoch": 0.058272, + "grad_norm": 0.703125, + "learning_rate": 9.493387096774195e-05, + "loss": 0.1622, + "step": 3642 + }, + { + "epoch": 0.058288, + "grad_norm": 0.87890625, + "learning_rate": 9.493225806451613e-05, + "loss": 0.1855, + "step": 3643 + }, + { + "epoch": 0.058304, + "grad_norm": 0.80078125, + "learning_rate": 9.493064516129033e-05, + "loss": 0.1657, + "step": 3644 + }, + { + "epoch": 0.05832, + "grad_norm": 0.7734375, + "learning_rate": 9.492903225806452e-05, + "loss": 0.2032, + "step": 3645 + }, + { + "epoch": 0.058336, + "grad_norm": 0.9765625, + "learning_rate": 9.49274193548387e-05, + "loss": 0.1883, + "step": 3646 + }, + { + "epoch": 0.058352, + "grad_norm": 0.9375, + "learning_rate": 9.49258064516129e-05, + "loss": 0.192, + "step": 3647 + }, + { + "epoch": 0.058368, + "grad_norm": 0.9609375, + "learning_rate": 9.49241935483871e-05, + "loss": 0.2044, + "step": 3648 + }, + { + "epoch": 0.058384, + "grad_norm": 0.81640625, + "learning_rate": 9.49225806451613e-05, + "loss": 0.1843, + "step": 3649 + }, + { + "epoch": 0.0584, + "grad_norm": 0.92578125, + "learning_rate": 9.492096774193549e-05, + "loss": 0.1879, + "step": 3650 + }, + { + "epoch": 0.058416, + "grad_norm": 0.63671875, + "learning_rate": 9.491935483870969e-05, + "loss": 0.1704, + "step": 3651 + }, + { + "epoch": 0.058432, + "grad_norm": 0.8046875, + "learning_rate": 9.491774193548387e-05, + "loss": 0.1905, + "step": 3652 + }, + { + "epoch": 0.058448, + "grad_norm": 0.79296875, + "learning_rate": 9.491612903225807e-05, + "loss": 0.2119, + "step": 3653 + }, + { + "epoch": 0.058464, + "grad_norm": 1.859375, + "learning_rate": 9.491451612903226e-05, + "loss": 0.2264, + "step": 3654 + }, + { + "epoch": 0.05848, + "grad_norm": 0.87109375, + "learning_rate": 9.491290322580646e-05, + "loss": 0.1809, + "step": 3655 + }, + { + "epoch": 0.058496, + "grad_norm": 1.0078125, + "learning_rate": 9.491129032258065e-05, + "loss": 0.1661, + "step": 3656 + }, + { + "epoch": 0.058512, + "grad_norm": 1.2890625, + "learning_rate": 9.490967741935485e-05, + "loss": 0.2051, + "step": 3657 + }, + { + "epoch": 0.058528, + "grad_norm": 0.9140625, + "learning_rate": 9.490806451612903e-05, + "loss": 0.1735, + "step": 3658 + }, + { + "epoch": 0.058544, + "grad_norm": 0.77734375, + "learning_rate": 9.490645161290323e-05, + "loss": 0.1499, + "step": 3659 + }, + { + "epoch": 0.05856, + "grad_norm": 1.0234375, + "learning_rate": 9.490483870967742e-05, + "loss": 0.2427, + "step": 3660 + }, + { + "epoch": 0.058576, + "grad_norm": 0.60546875, + "learning_rate": 9.490322580645162e-05, + "loss": 0.1516, + "step": 3661 + }, + { + "epoch": 0.058592, + "grad_norm": 0.7734375, + "learning_rate": 9.490161290322582e-05, + "loss": 0.1541, + "step": 3662 + }, + { + "epoch": 0.058608, + "grad_norm": 0.78125, + "learning_rate": 9.49e-05, + "loss": 0.2113, + "step": 3663 + }, + { + "epoch": 0.058624, + "grad_norm": 0.8828125, + "learning_rate": 9.48983870967742e-05, + "loss": 0.169, + "step": 3664 + }, + { + "epoch": 0.05864, + "grad_norm": 0.9140625, + "learning_rate": 9.489677419354839e-05, + "loss": 0.2052, + "step": 3665 + }, + { + "epoch": 0.058656, + "grad_norm": 0.6640625, + "learning_rate": 9.489516129032259e-05, + "loss": 0.1849, + "step": 3666 + }, + { + "epoch": 0.058672, + "grad_norm": 0.91015625, + "learning_rate": 9.489354838709677e-05, + "loss": 0.1532, + "step": 3667 + }, + { + "epoch": 0.058688, + "grad_norm": 1.0859375, + "learning_rate": 9.489193548387097e-05, + "loss": 0.2126, + "step": 3668 + }, + { + "epoch": 0.058704, + "grad_norm": 1.3671875, + "learning_rate": 9.489032258064516e-05, + "loss": 0.2046, + "step": 3669 + }, + { + "epoch": 0.05872, + "grad_norm": 1.25, + "learning_rate": 9.488870967741936e-05, + "loss": 0.1892, + "step": 3670 + }, + { + "epoch": 0.058736, + "grad_norm": 0.82421875, + "learning_rate": 9.488709677419355e-05, + "loss": 0.1395, + "step": 3671 + }, + { + "epoch": 0.058752, + "grad_norm": 1.203125, + "learning_rate": 9.488548387096775e-05, + "loss": 0.2222, + "step": 3672 + }, + { + "epoch": 0.058768, + "grad_norm": 0.91796875, + "learning_rate": 9.488387096774194e-05, + "loss": 0.1752, + "step": 3673 + }, + { + "epoch": 0.058784, + "grad_norm": 1.1484375, + "learning_rate": 9.488225806451614e-05, + "loss": 0.2136, + "step": 3674 + }, + { + "epoch": 0.0588, + "grad_norm": 1.375, + "learning_rate": 9.488064516129033e-05, + "loss": 0.1909, + "step": 3675 + }, + { + "epoch": 0.058816, + "grad_norm": 0.76953125, + "learning_rate": 9.487903225806452e-05, + "loss": 0.1822, + "step": 3676 + }, + { + "epoch": 0.058832, + "grad_norm": 0.91015625, + "learning_rate": 9.487741935483872e-05, + "loss": 0.1565, + "step": 3677 + }, + { + "epoch": 0.058848, + "grad_norm": 1.125, + "learning_rate": 9.48758064516129e-05, + "loss": 0.2324, + "step": 3678 + }, + { + "epoch": 0.058864, + "grad_norm": 0.77734375, + "learning_rate": 9.48741935483871e-05, + "loss": 0.1949, + "step": 3679 + }, + { + "epoch": 0.05888, + "grad_norm": 1.1484375, + "learning_rate": 9.487258064516129e-05, + "loss": 0.1771, + "step": 3680 + }, + { + "epoch": 0.058896, + "grad_norm": 0.875, + "learning_rate": 9.487096774193549e-05, + "loss": 0.2247, + "step": 3681 + }, + { + "epoch": 0.058912, + "grad_norm": 0.6796875, + "learning_rate": 9.486935483870967e-05, + "loss": 0.1718, + "step": 3682 + }, + { + "epoch": 0.058928, + "grad_norm": 1.5546875, + "learning_rate": 9.486774193548387e-05, + "loss": 0.1636, + "step": 3683 + }, + { + "epoch": 0.058944, + "grad_norm": 1.0390625, + "learning_rate": 9.486612903225807e-05, + "loss": 0.204, + "step": 3684 + }, + { + "epoch": 0.05896, + "grad_norm": 0.86328125, + "learning_rate": 9.486451612903227e-05, + "loss": 0.2044, + "step": 3685 + }, + { + "epoch": 0.058976, + "grad_norm": 1.5859375, + "learning_rate": 9.486290322580646e-05, + "loss": 0.2047, + "step": 3686 + }, + { + "epoch": 0.058992, + "grad_norm": 0.7734375, + "learning_rate": 9.486129032258066e-05, + "loss": 0.1767, + "step": 3687 + }, + { + "epoch": 0.059008, + "grad_norm": 1.0078125, + "learning_rate": 9.485967741935484e-05, + "loss": 0.1809, + "step": 3688 + }, + { + "epoch": 0.059024, + "grad_norm": 1.1171875, + "learning_rate": 9.485806451612904e-05, + "loss": 0.2284, + "step": 3689 + }, + { + "epoch": 0.05904, + "grad_norm": 0.671875, + "learning_rate": 9.485645161290323e-05, + "loss": 0.186, + "step": 3690 + }, + { + "epoch": 0.059056, + "grad_norm": 0.62109375, + "learning_rate": 9.485483870967743e-05, + "loss": 0.1482, + "step": 3691 + }, + { + "epoch": 0.059072, + "grad_norm": 0.734375, + "learning_rate": 9.485322580645162e-05, + "loss": 0.1455, + "step": 3692 + }, + { + "epoch": 0.059088, + "grad_norm": 0.93359375, + "learning_rate": 9.48516129032258e-05, + "loss": 0.1941, + "step": 3693 + }, + { + "epoch": 0.059104, + "grad_norm": 0.91015625, + "learning_rate": 9.485e-05, + "loss": 0.1916, + "step": 3694 + }, + { + "epoch": 0.05912, + "grad_norm": 0.72265625, + "learning_rate": 9.484838709677419e-05, + "loss": 0.17, + "step": 3695 + }, + { + "epoch": 0.059136, + "grad_norm": 1.2421875, + "learning_rate": 9.484677419354839e-05, + "loss": 0.1951, + "step": 3696 + }, + { + "epoch": 0.059152, + "grad_norm": 1.109375, + "learning_rate": 9.484516129032259e-05, + "loss": 0.1517, + "step": 3697 + }, + { + "epoch": 0.059168, + "grad_norm": 0.734375, + "learning_rate": 9.484354838709679e-05, + "loss": 0.1398, + "step": 3698 + }, + { + "epoch": 0.059184, + "grad_norm": 0.7578125, + "learning_rate": 9.484193548387097e-05, + "loss": 0.1828, + "step": 3699 + }, + { + "epoch": 0.0592, + "grad_norm": 0.76953125, + "learning_rate": 9.484032258064517e-05, + "loss": 0.2094, + "step": 3700 + }, + { + "epoch": 0.059216, + "grad_norm": 1.0859375, + "learning_rate": 9.483870967741936e-05, + "loss": 0.1911, + "step": 3701 + }, + { + "epoch": 0.059232, + "grad_norm": 1.5546875, + "learning_rate": 9.483709677419356e-05, + "loss": 0.2129, + "step": 3702 + }, + { + "epoch": 0.059248, + "grad_norm": 0.80078125, + "learning_rate": 9.483548387096774e-05, + "loss": 0.1816, + "step": 3703 + }, + { + "epoch": 0.059264, + "grad_norm": 1.0625, + "learning_rate": 9.483387096774194e-05, + "loss": 0.1633, + "step": 3704 + }, + { + "epoch": 0.05928, + "grad_norm": 0.921875, + "learning_rate": 9.483225806451613e-05, + "loss": 0.1897, + "step": 3705 + }, + { + "epoch": 0.059296, + "grad_norm": 0.9296875, + "learning_rate": 9.483064516129033e-05, + "loss": 0.1912, + "step": 3706 + }, + { + "epoch": 0.059312, + "grad_norm": 0.9921875, + "learning_rate": 9.482903225806452e-05, + "loss": 0.1665, + "step": 3707 + }, + { + "epoch": 0.059328, + "grad_norm": 0.80859375, + "learning_rate": 9.482741935483872e-05, + "loss": 0.1745, + "step": 3708 + }, + { + "epoch": 0.059344, + "grad_norm": 1.0234375, + "learning_rate": 9.482580645161291e-05, + "loss": 0.2302, + "step": 3709 + }, + { + "epoch": 0.05936, + "grad_norm": 1.46875, + "learning_rate": 9.48241935483871e-05, + "loss": 0.1659, + "step": 3710 + }, + { + "epoch": 0.059376, + "grad_norm": 2.0, + "learning_rate": 9.48225806451613e-05, + "loss": 0.2223, + "step": 3711 + }, + { + "epoch": 0.059392, + "grad_norm": 0.7890625, + "learning_rate": 9.482096774193549e-05, + "loss": 0.1995, + "step": 3712 + }, + { + "epoch": 0.059408, + "grad_norm": 1.4140625, + "learning_rate": 9.481935483870969e-05, + "loss": 0.2034, + "step": 3713 + }, + { + "epoch": 0.059424, + "grad_norm": 1.21875, + "learning_rate": 9.481774193548387e-05, + "loss": 0.1724, + "step": 3714 + }, + { + "epoch": 0.05944, + "grad_norm": 0.67578125, + "learning_rate": 9.481612903225807e-05, + "loss": 0.1816, + "step": 3715 + }, + { + "epoch": 0.059456, + "grad_norm": 0.98046875, + "learning_rate": 9.481451612903226e-05, + "loss": 0.1916, + "step": 3716 + }, + { + "epoch": 0.059472, + "grad_norm": 0.58984375, + "learning_rate": 9.481290322580646e-05, + "loss": 0.1641, + "step": 3717 + }, + { + "epoch": 0.059488, + "grad_norm": 0.79296875, + "learning_rate": 9.481129032258064e-05, + "loss": 0.1854, + "step": 3718 + }, + { + "epoch": 0.059504, + "grad_norm": 1.1796875, + "learning_rate": 9.480967741935484e-05, + "loss": 0.1922, + "step": 3719 + }, + { + "epoch": 0.05952, + "grad_norm": 0.69921875, + "learning_rate": 9.480806451612903e-05, + "loss": 0.177, + "step": 3720 + }, + { + "epoch": 0.059536, + "grad_norm": 0.92578125, + "learning_rate": 9.480645161290323e-05, + "loss": 0.1675, + "step": 3721 + }, + { + "epoch": 0.059552, + "grad_norm": 1.8046875, + "learning_rate": 9.480483870967743e-05, + "loss": 0.2013, + "step": 3722 + }, + { + "epoch": 0.059568, + "grad_norm": 1.5390625, + "learning_rate": 9.480322580645161e-05, + "loss": 0.2062, + "step": 3723 + }, + { + "epoch": 0.059584, + "grad_norm": 1.0703125, + "learning_rate": 9.480161290322581e-05, + "loss": 0.1974, + "step": 3724 + }, + { + "epoch": 0.0596, + "grad_norm": 0.59765625, + "learning_rate": 9.48e-05, + "loss": 0.1554, + "step": 3725 + }, + { + "epoch": 0.059616, + "grad_norm": 0.6875, + "learning_rate": 9.47983870967742e-05, + "loss": 0.1841, + "step": 3726 + }, + { + "epoch": 0.059632, + "grad_norm": 1.609375, + "learning_rate": 9.479677419354839e-05, + "loss": 0.1816, + "step": 3727 + }, + { + "epoch": 0.059648, + "grad_norm": 1.859375, + "learning_rate": 9.479516129032259e-05, + "loss": 0.1714, + "step": 3728 + }, + { + "epoch": 0.059664, + "grad_norm": 1.1484375, + "learning_rate": 9.479354838709677e-05, + "loss": 0.1917, + "step": 3729 + }, + { + "epoch": 0.05968, + "grad_norm": 0.78125, + "learning_rate": 9.479193548387097e-05, + "loss": 0.1713, + "step": 3730 + }, + { + "epoch": 0.059696, + "grad_norm": 1.1484375, + "learning_rate": 9.479032258064516e-05, + "loss": 0.1859, + "step": 3731 + }, + { + "epoch": 0.059712, + "grad_norm": 0.98046875, + "learning_rate": 9.478870967741936e-05, + "loss": 0.1499, + "step": 3732 + }, + { + "epoch": 0.059728, + "grad_norm": 0.92578125, + "learning_rate": 9.478709677419356e-05, + "loss": 0.1969, + "step": 3733 + }, + { + "epoch": 0.059744, + "grad_norm": 0.98046875, + "learning_rate": 9.478548387096776e-05, + "loss": 0.155, + "step": 3734 + }, + { + "epoch": 0.05976, + "grad_norm": 0.75, + "learning_rate": 9.478387096774194e-05, + "loss": 0.1958, + "step": 3735 + }, + { + "epoch": 0.059776, + "grad_norm": 0.93359375, + "learning_rate": 9.478225806451614e-05, + "loss": 0.1742, + "step": 3736 + }, + { + "epoch": 0.059792, + "grad_norm": 1.0078125, + "learning_rate": 9.478064516129033e-05, + "loss": 0.1682, + "step": 3737 + }, + { + "epoch": 0.059808, + "grad_norm": 0.7421875, + "learning_rate": 9.477903225806451e-05, + "loss": 0.1599, + "step": 3738 + }, + { + "epoch": 0.059824, + "grad_norm": 0.77734375, + "learning_rate": 9.477741935483871e-05, + "loss": 0.1381, + "step": 3739 + }, + { + "epoch": 0.05984, + "grad_norm": 1.46875, + "learning_rate": 9.47758064516129e-05, + "loss": 0.2187, + "step": 3740 + }, + { + "epoch": 0.059856, + "grad_norm": 0.75, + "learning_rate": 9.47741935483871e-05, + "loss": 0.1627, + "step": 3741 + }, + { + "epoch": 0.059872, + "grad_norm": 0.66015625, + "learning_rate": 9.477258064516129e-05, + "loss": 0.1604, + "step": 3742 + }, + { + "epoch": 0.059888, + "grad_norm": 0.72265625, + "learning_rate": 9.477096774193549e-05, + "loss": 0.14, + "step": 3743 + }, + { + "epoch": 0.059904, + "grad_norm": 0.953125, + "learning_rate": 9.476935483870968e-05, + "loss": 0.2245, + "step": 3744 + }, + { + "epoch": 0.05992, + "grad_norm": 0.80078125, + "learning_rate": 9.476774193548388e-05, + "loss": 0.1689, + "step": 3745 + }, + { + "epoch": 0.059936, + "grad_norm": 0.7734375, + "learning_rate": 9.476612903225807e-05, + "loss": 0.1814, + "step": 3746 + }, + { + "epoch": 0.059952, + "grad_norm": 0.91015625, + "learning_rate": 9.476451612903227e-05, + "loss": 0.2279, + "step": 3747 + }, + { + "epoch": 0.059968, + "grad_norm": 0.90625, + "learning_rate": 9.476290322580646e-05, + "loss": 0.2161, + "step": 3748 + }, + { + "epoch": 0.059984, + "grad_norm": 1.1640625, + "learning_rate": 9.476129032258066e-05, + "loss": 0.2082, + "step": 3749 + }, + { + "epoch": 0.06, + "grad_norm": 0.9453125, + "learning_rate": 9.475967741935484e-05, + "loss": 0.1345, + "step": 3750 + }, + { + "epoch": 0.060016, + "grad_norm": 0.73046875, + "learning_rate": 9.475806451612904e-05, + "loss": 0.1783, + "step": 3751 + }, + { + "epoch": 0.060032, + "grad_norm": 0.7890625, + "learning_rate": 9.475645161290323e-05, + "loss": 0.1756, + "step": 3752 + }, + { + "epoch": 0.060048, + "grad_norm": 1.4140625, + "learning_rate": 9.475483870967743e-05, + "loss": 0.2194, + "step": 3753 + }, + { + "epoch": 0.060064, + "grad_norm": 1.125, + "learning_rate": 9.475322580645161e-05, + "loss": 0.175, + "step": 3754 + }, + { + "epoch": 0.06008, + "grad_norm": 1.0078125, + "learning_rate": 9.47516129032258e-05, + "loss": 0.1946, + "step": 3755 + }, + { + "epoch": 0.060096, + "grad_norm": 1.0546875, + "learning_rate": 9.475e-05, + "loss": 0.211, + "step": 3756 + }, + { + "epoch": 0.060112, + "grad_norm": 0.8828125, + "learning_rate": 9.47483870967742e-05, + "loss": 0.191, + "step": 3757 + }, + { + "epoch": 0.060128, + "grad_norm": 0.82421875, + "learning_rate": 9.47467741935484e-05, + "loss": 0.1976, + "step": 3758 + }, + { + "epoch": 0.060144, + "grad_norm": 1.1015625, + "learning_rate": 9.474516129032258e-05, + "loss": 0.1804, + "step": 3759 + }, + { + "epoch": 0.06016, + "grad_norm": 0.7734375, + "learning_rate": 9.474354838709678e-05, + "loss": 0.1727, + "step": 3760 + }, + { + "epoch": 0.060176, + "grad_norm": 1.5859375, + "learning_rate": 9.474193548387097e-05, + "loss": 0.1398, + "step": 3761 + }, + { + "epoch": 0.060192, + "grad_norm": 1.0859375, + "learning_rate": 9.474032258064517e-05, + "loss": 0.1728, + "step": 3762 + }, + { + "epoch": 0.060208, + "grad_norm": 1.875, + "learning_rate": 9.473870967741936e-05, + "loss": 0.1995, + "step": 3763 + }, + { + "epoch": 0.060224, + "grad_norm": 1.078125, + "learning_rate": 9.473709677419356e-05, + "loss": 0.1956, + "step": 3764 + }, + { + "epoch": 0.06024, + "grad_norm": 1.21875, + "learning_rate": 9.473548387096774e-05, + "loss": 0.2543, + "step": 3765 + }, + { + "epoch": 0.060256, + "grad_norm": 0.76953125, + "learning_rate": 9.473387096774194e-05, + "loss": 0.1986, + "step": 3766 + }, + { + "epoch": 0.060272, + "grad_norm": 2.1875, + "learning_rate": 9.473225806451613e-05, + "loss": 0.2208, + "step": 3767 + }, + { + "epoch": 0.060288, + "grad_norm": 1.46875, + "learning_rate": 9.473064516129033e-05, + "loss": 0.1977, + "step": 3768 + }, + { + "epoch": 0.060304, + "grad_norm": 0.87109375, + "learning_rate": 9.472903225806453e-05, + "loss": 0.1776, + "step": 3769 + }, + { + "epoch": 0.06032, + "grad_norm": 1.8203125, + "learning_rate": 9.472741935483871e-05, + "loss": 0.2054, + "step": 3770 + }, + { + "epoch": 0.060336, + "grad_norm": 0.8515625, + "learning_rate": 9.472580645161291e-05, + "loss": 0.1907, + "step": 3771 + }, + { + "epoch": 0.060352, + "grad_norm": 0.7265625, + "learning_rate": 9.47241935483871e-05, + "loss": 0.1509, + "step": 3772 + }, + { + "epoch": 0.060368, + "grad_norm": 0.87890625, + "learning_rate": 9.47225806451613e-05, + "loss": 0.1557, + "step": 3773 + }, + { + "epoch": 0.060384, + "grad_norm": 0.73828125, + "learning_rate": 9.472096774193548e-05, + "loss": 0.2038, + "step": 3774 + }, + { + "epoch": 0.0604, + "grad_norm": 0.92578125, + "learning_rate": 9.471935483870968e-05, + "loss": 0.1638, + "step": 3775 + }, + { + "epoch": 0.060416, + "grad_norm": 0.85546875, + "learning_rate": 9.471774193548387e-05, + "loss": 0.213, + "step": 3776 + }, + { + "epoch": 0.060432, + "grad_norm": 0.84375, + "learning_rate": 9.471612903225807e-05, + "loss": 0.1739, + "step": 3777 + }, + { + "epoch": 0.060448, + "grad_norm": 0.921875, + "learning_rate": 9.471451612903226e-05, + "loss": 0.1602, + "step": 3778 + }, + { + "epoch": 0.060464, + "grad_norm": 0.90625, + "learning_rate": 9.471290322580646e-05, + "loss": 0.1918, + "step": 3779 + }, + { + "epoch": 0.06048, + "grad_norm": 0.96484375, + "learning_rate": 9.471129032258065e-05, + "loss": 0.1528, + "step": 3780 + }, + { + "epoch": 0.060496, + "grad_norm": 1.28125, + "learning_rate": 9.470967741935484e-05, + "loss": 0.1834, + "step": 3781 + }, + { + "epoch": 0.060512, + "grad_norm": 0.734375, + "learning_rate": 9.470806451612904e-05, + "loss": 0.2108, + "step": 3782 + }, + { + "epoch": 0.060528, + "grad_norm": 1.4609375, + "learning_rate": 9.470645161290324e-05, + "loss": 0.1869, + "step": 3783 + }, + { + "epoch": 0.060544, + "grad_norm": 0.83203125, + "learning_rate": 9.470483870967743e-05, + "loss": 0.1419, + "step": 3784 + }, + { + "epoch": 0.06056, + "grad_norm": 0.9140625, + "learning_rate": 9.470322580645161e-05, + "loss": 0.1534, + "step": 3785 + }, + { + "epoch": 0.060576, + "grad_norm": 0.71484375, + "learning_rate": 9.470161290322581e-05, + "loss": 0.1566, + "step": 3786 + }, + { + "epoch": 0.060592, + "grad_norm": 1.2890625, + "learning_rate": 9.47e-05, + "loss": 0.1606, + "step": 3787 + }, + { + "epoch": 0.060608, + "grad_norm": 1.1953125, + "learning_rate": 9.46983870967742e-05, + "loss": 0.211, + "step": 3788 + }, + { + "epoch": 0.060624, + "grad_norm": 1.2109375, + "learning_rate": 9.469677419354838e-05, + "loss": 0.2087, + "step": 3789 + }, + { + "epoch": 0.06064, + "grad_norm": 1.71875, + "learning_rate": 9.469516129032258e-05, + "loss": 0.15, + "step": 3790 + }, + { + "epoch": 0.060656, + "grad_norm": 1.703125, + "learning_rate": 9.469354838709677e-05, + "loss": 0.2135, + "step": 3791 + }, + { + "epoch": 0.060672, + "grad_norm": 1.046875, + "learning_rate": 9.469193548387097e-05, + "loss": 0.1513, + "step": 3792 + }, + { + "epoch": 0.060688, + "grad_norm": 1.1171875, + "learning_rate": 9.469032258064517e-05, + "loss": 0.2174, + "step": 3793 + }, + { + "epoch": 0.060704, + "grad_norm": 0.84765625, + "learning_rate": 9.468870967741937e-05, + "loss": 0.1804, + "step": 3794 + }, + { + "epoch": 0.06072, + "grad_norm": 0.8046875, + "learning_rate": 9.468709677419355e-05, + "loss": 0.1805, + "step": 3795 + }, + { + "epoch": 0.060736, + "grad_norm": 1.015625, + "learning_rate": 9.468548387096775e-05, + "loss": 0.1664, + "step": 3796 + }, + { + "epoch": 0.060752, + "grad_norm": 0.90234375, + "learning_rate": 9.468387096774194e-05, + "loss": 0.1929, + "step": 3797 + }, + { + "epoch": 0.060768, + "grad_norm": 0.91015625, + "learning_rate": 9.468225806451614e-05, + "loss": 0.1549, + "step": 3798 + }, + { + "epoch": 0.060784, + "grad_norm": 0.6875, + "learning_rate": 9.468064516129033e-05, + "loss": 0.1976, + "step": 3799 + }, + { + "epoch": 0.0608, + "grad_norm": 1.0234375, + "learning_rate": 9.467903225806453e-05, + "loss": 0.1972, + "step": 3800 + }, + { + "epoch": 0.060816, + "grad_norm": 0.95703125, + "learning_rate": 9.467741935483871e-05, + "loss": 0.1828, + "step": 3801 + }, + { + "epoch": 0.060832, + "grad_norm": 0.625, + "learning_rate": 9.46758064516129e-05, + "loss": 0.1717, + "step": 3802 + }, + { + "epoch": 0.060848, + "grad_norm": 1.125, + "learning_rate": 9.46741935483871e-05, + "loss": 0.1977, + "step": 3803 + }, + { + "epoch": 0.060864, + "grad_norm": 0.8984375, + "learning_rate": 9.46725806451613e-05, + "loss": 0.1643, + "step": 3804 + }, + { + "epoch": 0.06088, + "grad_norm": 1.328125, + "learning_rate": 9.46709677419355e-05, + "loss": 0.1552, + "step": 3805 + }, + { + "epoch": 0.060896, + "grad_norm": 1.1015625, + "learning_rate": 9.466935483870968e-05, + "loss": 0.1979, + "step": 3806 + }, + { + "epoch": 0.060912, + "grad_norm": 0.81640625, + "learning_rate": 9.466774193548388e-05, + "loss": 0.189, + "step": 3807 + }, + { + "epoch": 0.060928, + "grad_norm": 0.90625, + "learning_rate": 9.466612903225807e-05, + "loss": 0.1729, + "step": 3808 + }, + { + "epoch": 0.060944, + "grad_norm": 0.6953125, + "learning_rate": 9.466451612903227e-05, + "loss": 0.1794, + "step": 3809 + }, + { + "epoch": 0.06096, + "grad_norm": 0.7265625, + "learning_rate": 9.466290322580645e-05, + "loss": 0.2047, + "step": 3810 + }, + { + "epoch": 0.060976, + "grad_norm": 0.66796875, + "learning_rate": 9.466129032258065e-05, + "loss": 0.1672, + "step": 3811 + }, + { + "epoch": 0.060992, + "grad_norm": 0.8125, + "learning_rate": 9.465967741935484e-05, + "loss": 0.2125, + "step": 3812 + }, + { + "epoch": 0.061008, + "grad_norm": 0.87890625, + "learning_rate": 9.465806451612904e-05, + "loss": 0.1742, + "step": 3813 + }, + { + "epoch": 0.061024, + "grad_norm": 1.0703125, + "learning_rate": 9.465645161290323e-05, + "loss": 0.2197, + "step": 3814 + }, + { + "epoch": 0.06104, + "grad_norm": 0.890625, + "learning_rate": 9.465483870967742e-05, + "loss": 0.1546, + "step": 3815 + }, + { + "epoch": 0.061056, + "grad_norm": 0.84765625, + "learning_rate": 9.465322580645161e-05, + "loss": 0.2141, + "step": 3816 + }, + { + "epoch": 0.061072, + "grad_norm": 0.56640625, + "learning_rate": 9.465161290322581e-05, + "loss": 0.1586, + "step": 3817 + }, + { + "epoch": 0.061088, + "grad_norm": 0.80859375, + "learning_rate": 9.465000000000001e-05, + "loss": 0.1949, + "step": 3818 + }, + { + "epoch": 0.061104, + "grad_norm": 0.75390625, + "learning_rate": 9.46483870967742e-05, + "loss": 0.1948, + "step": 3819 + }, + { + "epoch": 0.06112, + "grad_norm": 0.76171875, + "learning_rate": 9.46467741935484e-05, + "loss": 0.2057, + "step": 3820 + }, + { + "epoch": 0.061136, + "grad_norm": 0.94140625, + "learning_rate": 9.464516129032258e-05, + "loss": 0.2197, + "step": 3821 + }, + { + "epoch": 0.061152, + "grad_norm": 0.80859375, + "learning_rate": 9.464354838709678e-05, + "loss": 0.1802, + "step": 3822 + }, + { + "epoch": 0.061168, + "grad_norm": 0.87890625, + "learning_rate": 9.464193548387097e-05, + "loss": 0.2015, + "step": 3823 + }, + { + "epoch": 0.061184, + "grad_norm": 0.55859375, + "learning_rate": 9.464032258064517e-05, + "loss": 0.1654, + "step": 3824 + }, + { + "epoch": 0.0612, + "grad_norm": 0.490234375, + "learning_rate": 9.463870967741935e-05, + "loss": 0.1637, + "step": 3825 + }, + { + "epoch": 0.061216, + "grad_norm": 1.0234375, + "learning_rate": 9.463709677419355e-05, + "loss": 0.187, + "step": 3826 + }, + { + "epoch": 0.061232, + "grad_norm": 0.58984375, + "learning_rate": 9.463548387096774e-05, + "loss": 0.1801, + "step": 3827 + }, + { + "epoch": 0.061248, + "grad_norm": 0.6171875, + "learning_rate": 9.463387096774194e-05, + "loss": 0.1651, + "step": 3828 + }, + { + "epoch": 0.061264, + "grad_norm": 0.796875, + "learning_rate": 9.463225806451614e-05, + "loss": 0.1698, + "step": 3829 + }, + { + "epoch": 0.06128, + "grad_norm": 0.7421875, + "learning_rate": 9.463064516129034e-05, + "loss": 0.1985, + "step": 3830 + }, + { + "epoch": 0.061296, + "grad_norm": 0.90234375, + "learning_rate": 9.462903225806452e-05, + "loss": 0.1459, + "step": 3831 + }, + { + "epoch": 0.061312, + "grad_norm": 0.75, + "learning_rate": 9.462741935483871e-05, + "loss": 0.1785, + "step": 3832 + }, + { + "epoch": 0.061328, + "grad_norm": 0.68359375, + "learning_rate": 9.462580645161291e-05, + "loss": 0.185, + "step": 3833 + }, + { + "epoch": 0.061344, + "grad_norm": 0.67578125, + "learning_rate": 9.46241935483871e-05, + "loss": 0.1851, + "step": 3834 + }, + { + "epoch": 0.06136, + "grad_norm": 0.85546875, + "learning_rate": 9.46225806451613e-05, + "loss": 0.2044, + "step": 3835 + }, + { + "epoch": 0.061376, + "grad_norm": 0.9375, + "learning_rate": 9.462096774193548e-05, + "loss": 0.202, + "step": 3836 + }, + { + "epoch": 0.061392, + "grad_norm": 1.21875, + "learning_rate": 9.461935483870968e-05, + "loss": 0.1943, + "step": 3837 + }, + { + "epoch": 0.061408, + "grad_norm": 0.609375, + "learning_rate": 9.461774193548387e-05, + "loss": 0.184, + "step": 3838 + }, + { + "epoch": 0.061424, + "grad_norm": 0.79296875, + "learning_rate": 9.461612903225807e-05, + "loss": 0.1909, + "step": 3839 + }, + { + "epoch": 0.06144, + "grad_norm": 0.65625, + "learning_rate": 9.461451612903227e-05, + "loss": 0.1366, + "step": 3840 + }, + { + "epoch": 0.061456, + "grad_norm": 0.62109375, + "learning_rate": 9.461290322580647e-05, + "loss": 0.1902, + "step": 3841 + }, + { + "epoch": 0.061472, + "grad_norm": 1.0234375, + "learning_rate": 9.461129032258065e-05, + "loss": 0.1989, + "step": 3842 + }, + { + "epoch": 0.061488, + "grad_norm": 0.71484375, + "learning_rate": 9.460967741935485e-05, + "loss": 0.1845, + "step": 3843 + }, + { + "epoch": 0.061504, + "grad_norm": 0.6328125, + "learning_rate": 9.460806451612904e-05, + "loss": 0.1727, + "step": 3844 + }, + { + "epoch": 0.06152, + "grad_norm": 0.78515625, + "learning_rate": 9.460645161290324e-05, + "loss": 0.1995, + "step": 3845 + }, + { + "epoch": 0.061536, + "grad_norm": 0.9140625, + "learning_rate": 9.460483870967742e-05, + "loss": 0.1641, + "step": 3846 + }, + { + "epoch": 0.061552, + "grad_norm": 0.6796875, + "learning_rate": 9.460322580645161e-05, + "loss": 0.2055, + "step": 3847 + }, + { + "epoch": 0.061568, + "grad_norm": 0.6875, + "learning_rate": 9.460161290322581e-05, + "loss": 0.1754, + "step": 3848 + }, + { + "epoch": 0.061584, + "grad_norm": 0.890625, + "learning_rate": 9.46e-05, + "loss": 0.2149, + "step": 3849 + }, + { + "epoch": 0.0616, + "grad_norm": 0.8203125, + "learning_rate": 9.45983870967742e-05, + "loss": 0.1532, + "step": 3850 + }, + { + "epoch": 0.061616, + "grad_norm": 1.015625, + "learning_rate": 9.459677419354838e-05, + "loss": 0.2005, + "step": 3851 + }, + { + "epoch": 0.061632, + "grad_norm": 0.8125, + "learning_rate": 9.459516129032258e-05, + "loss": 0.2126, + "step": 3852 + }, + { + "epoch": 0.061648, + "grad_norm": 0.8203125, + "learning_rate": 9.459354838709678e-05, + "loss": 0.1978, + "step": 3853 + }, + { + "epoch": 0.061664, + "grad_norm": 0.8203125, + "learning_rate": 9.459193548387098e-05, + "loss": 0.1604, + "step": 3854 + }, + { + "epoch": 0.06168, + "grad_norm": 1.078125, + "learning_rate": 9.459032258064517e-05, + "loss": 0.1839, + "step": 3855 + }, + { + "epoch": 0.061696, + "grad_norm": 0.77734375, + "learning_rate": 9.458870967741937e-05, + "loss": 0.1709, + "step": 3856 + }, + { + "epoch": 0.061712, + "grad_norm": 0.60546875, + "learning_rate": 9.458709677419355e-05, + "loss": 0.1413, + "step": 3857 + }, + { + "epoch": 0.061728, + "grad_norm": 0.84375, + "learning_rate": 9.458548387096775e-05, + "loss": 0.1996, + "step": 3858 + }, + { + "epoch": 0.061744, + "grad_norm": 1.296875, + "learning_rate": 9.458387096774194e-05, + "loss": 0.1956, + "step": 3859 + }, + { + "epoch": 0.06176, + "grad_norm": 1.8125, + "learning_rate": 9.458225806451614e-05, + "loss": 0.1754, + "step": 3860 + }, + { + "epoch": 0.061776, + "grad_norm": 0.7265625, + "learning_rate": 9.458064516129032e-05, + "loss": 0.1856, + "step": 3861 + }, + { + "epoch": 0.061792, + "grad_norm": 1.2578125, + "learning_rate": 9.457903225806452e-05, + "loss": 0.1954, + "step": 3862 + }, + { + "epoch": 0.061808, + "grad_norm": 1.140625, + "learning_rate": 9.457741935483871e-05, + "loss": 0.2055, + "step": 3863 + }, + { + "epoch": 0.061824, + "grad_norm": 0.69921875, + "learning_rate": 9.457580645161291e-05, + "loss": 0.2162, + "step": 3864 + }, + { + "epoch": 0.06184, + "grad_norm": 0.78125, + "learning_rate": 9.457419354838711e-05, + "loss": 0.2104, + "step": 3865 + }, + { + "epoch": 0.061856, + "grad_norm": 0.69921875, + "learning_rate": 9.45725806451613e-05, + "loss": 0.1464, + "step": 3866 + }, + { + "epoch": 0.061872, + "grad_norm": 0.5859375, + "learning_rate": 9.45709677419355e-05, + "loss": 0.1367, + "step": 3867 + }, + { + "epoch": 0.061888, + "grad_norm": 0.66796875, + "learning_rate": 9.456935483870968e-05, + "loss": 0.1962, + "step": 3868 + }, + { + "epoch": 0.061904, + "grad_norm": 0.6640625, + "learning_rate": 9.456774193548388e-05, + "loss": 0.1582, + "step": 3869 + }, + { + "epoch": 0.06192, + "grad_norm": 1.1015625, + "learning_rate": 9.456612903225807e-05, + "loss": 0.1749, + "step": 3870 + }, + { + "epoch": 0.061936, + "grad_norm": 0.8203125, + "learning_rate": 9.456451612903227e-05, + "loss": 0.1731, + "step": 3871 + }, + { + "epoch": 0.061952, + "grad_norm": 1.2109375, + "learning_rate": 9.456290322580645e-05, + "loss": 0.1595, + "step": 3872 + }, + { + "epoch": 0.061968, + "grad_norm": 0.71484375, + "learning_rate": 9.456129032258065e-05, + "loss": 0.1415, + "step": 3873 + }, + { + "epoch": 0.061984, + "grad_norm": 1.0546875, + "learning_rate": 9.455967741935484e-05, + "loss": 0.2027, + "step": 3874 + }, + { + "epoch": 0.062, + "grad_norm": 0.78125, + "learning_rate": 9.455806451612904e-05, + "loss": 0.184, + "step": 3875 + }, + { + "epoch": 0.062016, + "grad_norm": 0.8515625, + "learning_rate": 9.455645161290322e-05, + "loss": 0.1793, + "step": 3876 + }, + { + "epoch": 0.062032, + "grad_norm": 0.79296875, + "learning_rate": 9.455483870967742e-05, + "loss": 0.166, + "step": 3877 + }, + { + "epoch": 0.062048, + "grad_norm": 1.3046875, + "learning_rate": 9.455322580645162e-05, + "loss": 0.199, + "step": 3878 + }, + { + "epoch": 0.062064, + "grad_norm": 0.79296875, + "learning_rate": 9.455161290322581e-05, + "loss": 0.2266, + "step": 3879 + }, + { + "epoch": 0.06208, + "grad_norm": 0.76171875, + "learning_rate": 9.455000000000001e-05, + "loss": 0.1626, + "step": 3880 + }, + { + "epoch": 0.062096, + "grad_norm": 0.875, + "learning_rate": 9.45483870967742e-05, + "loss": 0.1765, + "step": 3881 + }, + { + "epoch": 0.062112, + "grad_norm": 0.94140625, + "learning_rate": 9.45467741935484e-05, + "loss": 0.1769, + "step": 3882 + }, + { + "epoch": 0.062128, + "grad_norm": 0.98046875, + "learning_rate": 9.454516129032258e-05, + "loss": 0.1735, + "step": 3883 + }, + { + "epoch": 0.062144, + "grad_norm": 0.61328125, + "learning_rate": 9.454354838709678e-05, + "loss": 0.1836, + "step": 3884 + }, + { + "epoch": 0.06216, + "grad_norm": 1.0390625, + "learning_rate": 9.454193548387097e-05, + "loss": 0.2073, + "step": 3885 + }, + { + "epoch": 0.062176, + "grad_norm": 0.9375, + "learning_rate": 9.454032258064516e-05, + "loss": 0.1729, + "step": 3886 + }, + { + "epoch": 0.062192, + "grad_norm": 0.95703125, + "learning_rate": 9.453870967741935e-05, + "loss": 0.1666, + "step": 3887 + }, + { + "epoch": 0.062208, + "grad_norm": 0.81640625, + "learning_rate": 9.453709677419355e-05, + "loss": 0.1835, + "step": 3888 + }, + { + "epoch": 0.062224, + "grad_norm": 0.9375, + "learning_rate": 9.453548387096775e-05, + "loss": 0.192, + "step": 3889 + }, + { + "epoch": 0.06224, + "grad_norm": 0.67578125, + "learning_rate": 9.453387096774195e-05, + "loss": 0.1539, + "step": 3890 + }, + { + "epoch": 0.062256, + "grad_norm": 0.7421875, + "learning_rate": 9.453225806451614e-05, + "loss": 0.1693, + "step": 3891 + }, + { + "epoch": 0.062272, + "grad_norm": 0.5703125, + "learning_rate": 9.453064516129034e-05, + "loss": 0.1609, + "step": 3892 + }, + { + "epoch": 0.062288, + "grad_norm": 0.72265625, + "learning_rate": 9.452903225806452e-05, + "loss": 0.1894, + "step": 3893 + }, + { + "epoch": 0.062304, + "grad_norm": 0.98828125, + "learning_rate": 9.452741935483871e-05, + "loss": 0.2074, + "step": 3894 + }, + { + "epoch": 0.06232, + "grad_norm": 0.6171875, + "learning_rate": 9.452580645161291e-05, + "loss": 0.1637, + "step": 3895 + }, + { + "epoch": 0.062336, + "grad_norm": 1.3359375, + "learning_rate": 9.45241935483871e-05, + "loss": 0.2201, + "step": 3896 + }, + { + "epoch": 0.062352, + "grad_norm": 0.8203125, + "learning_rate": 9.452258064516129e-05, + "loss": 0.15, + "step": 3897 + }, + { + "epoch": 0.062368, + "grad_norm": 0.796875, + "learning_rate": 9.452096774193548e-05, + "loss": 0.1619, + "step": 3898 + }, + { + "epoch": 0.062384, + "grad_norm": 0.7890625, + "learning_rate": 9.451935483870968e-05, + "loss": 0.2026, + "step": 3899 + }, + { + "epoch": 0.0624, + "grad_norm": 1.390625, + "learning_rate": 9.451774193548388e-05, + "loss": 0.1668, + "step": 3900 + }, + { + "epoch": 0.062416, + "grad_norm": 0.953125, + "learning_rate": 9.451612903225808e-05, + "loss": 0.1575, + "step": 3901 + }, + { + "epoch": 0.062432, + "grad_norm": 0.93359375, + "learning_rate": 9.451451612903226e-05, + "loss": 0.1955, + "step": 3902 + }, + { + "epoch": 0.062448, + "grad_norm": 0.9921875, + "learning_rate": 9.451290322580646e-05, + "loss": 0.1821, + "step": 3903 + }, + { + "epoch": 0.062464, + "grad_norm": 1.8515625, + "learning_rate": 9.451129032258065e-05, + "loss": 0.2086, + "step": 3904 + }, + { + "epoch": 0.06248, + "grad_norm": 1.8046875, + "learning_rate": 9.450967741935485e-05, + "loss": 0.1961, + "step": 3905 + }, + { + "epoch": 0.062496, + "grad_norm": 0.9765625, + "learning_rate": 9.450806451612904e-05, + "loss": 0.1829, + "step": 3906 + }, + { + "epoch": 0.062512, + "grad_norm": 1.1171875, + "learning_rate": 9.450645161290324e-05, + "loss": 0.219, + "step": 3907 + }, + { + "epoch": 0.062528, + "grad_norm": 1.1484375, + "learning_rate": 9.450483870967742e-05, + "loss": 0.1764, + "step": 3908 + }, + { + "epoch": 0.062544, + "grad_norm": 1.0390625, + "learning_rate": 9.450322580645162e-05, + "loss": 0.194, + "step": 3909 + }, + { + "epoch": 0.06256, + "grad_norm": 1.09375, + "learning_rate": 9.450161290322581e-05, + "loss": 0.19, + "step": 3910 + }, + { + "epoch": 0.062576, + "grad_norm": 0.91796875, + "learning_rate": 9.449999999999999e-05, + "loss": 0.157, + "step": 3911 + }, + { + "epoch": 0.062592, + "grad_norm": 0.68359375, + "learning_rate": 9.449838709677419e-05, + "loss": 0.1958, + "step": 3912 + }, + { + "epoch": 0.062608, + "grad_norm": 1.359375, + "learning_rate": 9.449677419354839e-05, + "loss": 0.1801, + "step": 3913 + }, + { + "epoch": 0.062624, + "grad_norm": 0.64453125, + "learning_rate": 9.449516129032259e-05, + "loss": 0.1672, + "step": 3914 + }, + { + "epoch": 0.06264, + "grad_norm": 1.015625, + "learning_rate": 9.449354838709678e-05, + "loss": 0.2635, + "step": 3915 + }, + { + "epoch": 0.062656, + "grad_norm": 1.203125, + "learning_rate": 9.449193548387098e-05, + "loss": 0.1669, + "step": 3916 + }, + { + "epoch": 0.062672, + "grad_norm": 0.69140625, + "learning_rate": 9.449032258064516e-05, + "loss": 0.178, + "step": 3917 + }, + { + "epoch": 0.062688, + "grad_norm": 0.8203125, + "learning_rate": 9.448870967741936e-05, + "loss": 0.179, + "step": 3918 + }, + { + "epoch": 0.062704, + "grad_norm": 0.8671875, + "learning_rate": 9.448709677419355e-05, + "loss": 0.1718, + "step": 3919 + }, + { + "epoch": 0.06272, + "grad_norm": 0.6171875, + "learning_rate": 9.448548387096775e-05, + "loss": 0.1762, + "step": 3920 + }, + { + "epoch": 0.062736, + "grad_norm": 0.87109375, + "learning_rate": 9.448387096774194e-05, + "loss": 0.2465, + "step": 3921 + }, + { + "epoch": 0.062752, + "grad_norm": 0.68359375, + "learning_rate": 9.448225806451613e-05, + "loss": 0.1632, + "step": 3922 + }, + { + "epoch": 0.062768, + "grad_norm": 1.0, + "learning_rate": 9.448064516129032e-05, + "loss": 0.2349, + "step": 3923 + }, + { + "epoch": 0.062784, + "grad_norm": 0.58984375, + "learning_rate": 9.447903225806452e-05, + "loss": 0.1565, + "step": 3924 + }, + { + "epoch": 0.0628, + "grad_norm": 0.8984375, + "learning_rate": 9.447741935483872e-05, + "loss": 0.1964, + "step": 3925 + }, + { + "epoch": 0.062816, + "grad_norm": 2.296875, + "learning_rate": 9.44758064516129e-05, + "loss": 0.2098, + "step": 3926 + }, + { + "epoch": 0.062832, + "grad_norm": 0.62890625, + "learning_rate": 9.44741935483871e-05, + "loss": 0.1871, + "step": 3927 + }, + { + "epoch": 0.062848, + "grad_norm": 1.1640625, + "learning_rate": 9.447258064516129e-05, + "loss": 0.1429, + "step": 3928 + }, + { + "epoch": 0.062864, + "grad_norm": 0.75390625, + "learning_rate": 9.447096774193549e-05, + "loss": 0.1684, + "step": 3929 + }, + { + "epoch": 0.06288, + "grad_norm": 0.890625, + "learning_rate": 9.446935483870968e-05, + "loss": 0.1731, + "step": 3930 + }, + { + "epoch": 0.062896, + "grad_norm": 0.5703125, + "learning_rate": 9.446774193548388e-05, + "loss": 0.1674, + "step": 3931 + }, + { + "epoch": 0.062912, + "grad_norm": 1.5, + "learning_rate": 9.446612903225806e-05, + "loss": 0.209, + "step": 3932 + }, + { + "epoch": 0.062928, + "grad_norm": 1.046875, + "learning_rate": 9.446451612903226e-05, + "loss": 0.2116, + "step": 3933 + }, + { + "epoch": 0.062944, + "grad_norm": 1.015625, + "learning_rate": 9.446290322580645e-05, + "loss": 0.1748, + "step": 3934 + }, + { + "epoch": 0.06296, + "grad_norm": 0.9140625, + "learning_rate": 9.446129032258065e-05, + "loss": 0.1756, + "step": 3935 + }, + { + "epoch": 0.062976, + "grad_norm": 0.96484375, + "learning_rate": 9.445967741935485e-05, + "loss": 0.1867, + "step": 3936 + }, + { + "epoch": 0.062992, + "grad_norm": 0.8203125, + "learning_rate": 9.445806451612905e-05, + "loss": 0.2114, + "step": 3937 + }, + { + "epoch": 0.063008, + "grad_norm": 0.9140625, + "learning_rate": 9.445645161290323e-05, + "loss": 0.1768, + "step": 3938 + }, + { + "epoch": 0.063024, + "grad_norm": 1.203125, + "learning_rate": 9.445483870967743e-05, + "loss": 0.1756, + "step": 3939 + }, + { + "epoch": 0.06304, + "grad_norm": 0.8984375, + "learning_rate": 9.445322580645162e-05, + "loss": 0.185, + "step": 3940 + }, + { + "epoch": 0.063056, + "grad_norm": 0.7109375, + "learning_rate": 9.44516129032258e-05, + "loss": 0.2184, + "step": 3941 + }, + { + "epoch": 0.063072, + "grad_norm": 0.7578125, + "learning_rate": 9.445e-05, + "loss": 0.2056, + "step": 3942 + }, + { + "epoch": 0.063088, + "grad_norm": 1.3046875, + "learning_rate": 9.444838709677419e-05, + "loss": 0.1915, + "step": 3943 + }, + { + "epoch": 0.063104, + "grad_norm": 1.3359375, + "learning_rate": 9.444677419354839e-05, + "loss": 0.207, + "step": 3944 + }, + { + "epoch": 0.06312, + "grad_norm": 0.94921875, + "learning_rate": 9.444516129032258e-05, + "loss": 0.2125, + "step": 3945 + }, + { + "epoch": 0.063136, + "grad_norm": 0.73828125, + "learning_rate": 9.444354838709678e-05, + "loss": 0.1847, + "step": 3946 + }, + { + "epoch": 0.063152, + "grad_norm": 0.85546875, + "learning_rate": 9.444193548387096e-05, + "loss": 0.1813, + "step": 3947 + }, + { + "epoch": 0.063168, + "grad_norm": 1.0859375, + "learning_rate": 9.444032258064516e-05, + "loss": 0.1555, + "step": 3948 + }, + { + "epoch": 0.063184, + "grad_norm": 1.0859375, + "learning_rate": 9.443870967741936e-05, + "loss": 0.1578, + "step": 3949 + }, + { + "epoch": 0.0632, + "grad_norm": 0.578125, + "learning_rate": 9.443709677419356e-05, + "loss": 0.1647, + "step": 3950 + }, + { + "epoch": 0.063216, + "grad_norm": 0.8828125, + "learning_rate": 9.443548387096775e-05, + "loss": 0.1717, + "step": 3951 + }, + { + "epoch": 0.063232, + "grad_norm": 0.72265625, + "learning_rate": 9.443387096774195e-05, + "loss": 0.1801, + "step": 3952 + }, + { + "epoch": 0.063248, + "grad_norm": 0.65234375, + "learning_rate": 9.443225806451613e-05, + "loss": 0.1967, + "step": 3953 + }, + { + "epoch": 0.063264, + "grad_norm": 0.79296875, + "learning_rate": 9.443064516129033e-05, + "loss": 0.1829, + "step": 3954 + }, + { + "epoch": 0.06328, + "grad_norm": 0.8671875, + "learning_rate": 9.442903225806452e-05, + "loss": 0.2066, + "step": 3955 + }, + { + "epoch": 0.063296, + "grad_norm": 0.9453125, + "learning_rate": 9.44274193548387e-05, + "loss": 0.1892, + "step": 3956 + }, + { + "epoch": 0.063312, + "grad_norm": 1.0703125, + "learning_rate": 9.44258064516129e-05, + "loss": 0.2113, + "step": 3957 + }, + { + "epoch": 0.063328, + "grad_norm": 0.875, + "learning_rate": 9.442419354838709e-05, + "loss": 0.1544, + "step": 3958 + }, + { + "epoch": 0.063344, + "grad_norm": 1.1953125, + "learning_rate": 9.442258064516129e-05, + "loss": 0.2131, + "step": 3959 + }, + { + "epoch": 0.06336, + "grad_norm": 0.65234375, + "learning_rate": 9.442096774193549e-05, + "loss": 0.1406, + "step": 3960 + }, + { + "epoch": 0.063376, + "grad_norm": 0.80078125, + "learning_rate": 9.441935483870969e-05, + "loss": 0.1607, + "step": 3961 + }, + { + "epoch": 0.063392, + "grad_norm": 0.765625, + "learning_rate": 9.441774193548388e-05, + "loss": 0.15, + "step": 3962 + }, + { + "epoch": 0.063408, + "grad_norm": 0.73828125, + "learning_rate": 9.441612903225808e-05, + "loss": 0.1678, + "step": 3963 + }, + { + "epoch": 0.063424, + "grad_norm": 0.76171875, + "learning_rate": 9.441451612903226e-05, + "loss": 0.186, + "step": 3964 + }, + { + "epoch": 0.06344, + "grad_norm": 0.9140625, + "learning_rate": 9.441290322580646e-05, + "loss": 0.1597, + "step": 3965 + }, + { + "epoch": 0.063456, + "grad_norm": 0.99609375, + "learning_rate": 9.441129032258065e-05, + "loss": 0.1671, + "step": 3966 + }, + { + "epoch": 0.063472, + "grad_norm": 1.2109375, + "learning_rate": 9.440967741935485e-05, + "loss": 0.1747, + "step": 3967 + }, + { + "epoch": 0.063488, + "grad_norm": 0.83984375, + "learning_rate": 9.440806451612903e-05, + "loss": 0.185, + "step": 3968 + }, + { + "epoch": 0.063504, + "grad_norm": 0.7578125, + "learning_rate": 9.440645161290323e-05, + "loss": 0.1872, + "step": 3969 + }, + { + "epoch": 0.06352, + "grad_norm": 1.0703125, + "learning_rate": 9.440483870967742e-05, + "loss": 0.2251, + "step": 3970 + }, + { + "epoch": 0.063536, + "grad_norm": 0.94140625, + "learning_rate": 9.440322580645162e-05, + "loss": 0.1898, + "step": 3971 + }, + { + "epoch": 0.063552, + "grad_norm": 1.2734375, + "learning_rate": 9.44016129032258e-05, + "loss": 0.1708, + "step": 3972 + }, + { + "epoch": 0.063568, + "grad_norm": 0.81640625, + "learning_rate": 9.44e-05, + "loss": 0.2076, + "step": 3973 + }, + { + "epoch": 0.063584, + "grad_norm": 1.375, + "learning_rate": 9.43983870967742e-05, + "loss": 0.2299, + "step": 3974 + }, + { + "epoch": 0.0636, + "grad_norm": 1.59375, + "learning_rate": 9.439677419354839e-05, + "loss": 0.1638, + "step": 3975 + }, + { + "epoch": 0.063616, + "grad_norm": 0.7109375, + "learning_rate": 9.439516129032259e-05, + "loss": 0.1345, + "step": 3976 + }, + { + "epoch": 0.063632, + "grad_norm": 1.1484375, + "learning_rate": 9.439354838709678e-05, + "loss": 0.1956, + "step": 3977 + }, + { + "epoch": 0.063648, + "grad_norm": 0.7421875, + "learning_rate": 9.439193548387098e-05, + "loss": 0.185, + "step": 3978 + }, + { + "epoch": 0.063664, + "grad_norm": 0.765625, + "learning_rate": 9.439032258064516e-05, + "loss": 0.198, + "step": 3979 + }, + { + "epoch": 0.06368, + "grad_norm": 1.578125, + "learning_rate": 9.438870967741936e-05, + "loss": 0.182, + "step": 3980 + }, + { + "epoch": 0.063696, + "grad_norm": 0.5859375, + "learning_rate": 9.438709677419355e-05, + "loss": 0.1874, + "step": 3981 + }, + { + "epoch": 0.063712, + "grad_norm": 0.82421875, + "learning_rate": 9.438548387096775e-05, + "loss": 0.2083, + "step": 3982 + }, + { + "epoch": 0.063728, + "grad_norm": 1.078125, + "learning_rate": 9.438387096774193e-05, + "loss": 0.2091, + "step": 3983 + }, + { + "epoch": 0.063744, + "grad_norm": 1.375, + "learning_rate": 9.438225806451613e-05, + "loss": 0.2094, + "step": 3984 + }, + { + "epoch": 0.06376, + "grad_norm": 0.68359375, + "learning_rate": 9.438064516129033e-05, + "loss": 0.1594, + "step": 3985 + }, + { + "epoch": 0.063776, + "grad_norm": 1.171875, + "learning_rate": 9.437903225806453e-05, + "loss": 0.1893, + "step": 3986 + }, + { + "epoch": 0.063792, + "grad_norm": 0.7421875, + "learning_rate": 9.437741935483872e-05, + "loss": 0.1633, + "step": 3987 + }, + { + "epoch": 0.063808, + "grad_norm": 0.77734375, + "learning_rate": 9.43758064516129e-05, + "loss": 0.2005, + "step": 3988 + }, + { + "epoch": 0.063824, + "grad_norm": 0.95703125, + "learning_rate": 9.43741935483871e-05, + "loss": 0.1923, + "step": 3989 + }, + { + "epoch": 0.06384, + "grad_norm": 0.7265625, + "learning_rate": 9.437258064516129e-05, + "loss": 0.1761, + "step": 3990 + }, + { + "epoch": 0.063856, + "grad_norm": 0.90625, + "learning_rate": 9.437096774193549e-05, + "loss": 0.1851, + "step": 3991 + }, + { + "epoch": 0.063872, + "grad_norm": 0.6171875, + "learning_rate": 9.436935483870968e-05, + "loss": 0.188, + "step": 3992 + }, + { + "epoch": 0.063888, + "grad_norm": 1.015625, + "learning_rate": 9.436774193548387e-05, + "loss": 0.176, + "step": 3993 + }, + { + "epoch": 0.063904, + "grad_norm": 0.90234375, + "learning_rate": 9.436612903225806e-05, + "loss": 0.1928, + "step": 3994 + }, + { + "epoch": 0.06392, + "grad_norm": 1.4296875, + "learning_rate": 9.436451612903226e-05, + "loss": 0.2359, + "step": 3995 + }, + { + "epoch": 0.063936, + "grad_norm": 0.6015625, + "learning_rate": 9.436290322580646e-05, + "loss": 0.1431, + "step": 3996 + }, + { + "epoch": 0.063952, + "grad_norm": 1.5703125, + "learning_rate": 9.436129032258066e-05, + "loss": 0.2064, + "step": 3997 + }, + { + "epoch": 0.063968, + "grad_norm": 1.3671875, + "learning_rate": 9.435967741935485e-05, + "loss": 0.2037, + "step": 3998 + }, + { + "epoch": 0.063984, + "grad_norm": 0.73828125, + "learning_rate": 9.435806451612905e-05, + "loss": 0.1969, + "step": 3999 + }, + { + "epoch": 0.064, + "grad_norm": 0.734375, + "learning_rate": 9.435645161290323e-05, + "loss": 0.1644, + "step": 4000 + }, + { + "epoch": 0.064016, + "grad_norm": 1.328125, + "learning_rate": 9.435483870967743e-05, + "loss": 0.2575, + "step": 4001 + }, + { + "epoch": 0.064032, + "grad_norm": 0.890625, + "learning_rate": 9.435322580645162e-05, + "loss": 0.1753, + "step": 4002 + }, + { + "epoch": 0.064048, + "grad_norm": 0.69921875, + "learning_rate": 9.43516129032258e-05, + "loss": 0.1959, + "step": 4003 + }, + { + "epoch": 0.064064, + "grad_norm": 0.7265625, + "learning_rate": 9.435e-05, + "loss": 0.1613, + "step": 4004 + }, + { + "epoch": 0.06408, + "grad_norm": 0.734375, + "learning_rate": 9.434838709677419e-05, + "loss": 0.206, + "step": 4005 + }, + { + "epoch": 0.064096, + "grad_norm": 0.921875, + "learning_rate": 9.434677419354839e-05, + "loss": 0.2054, + "step": 4006 + }, + { + "epoch": 0.064112, + "grad_norm": 0.78515625, + "learning_rate": 9.434516129032257e-05, + "loss": 0.1724, + "step": 4007 + }, + { + "epoch": 0.064128, + "grad_norm": 0.91015625, + "learning_rate": 9.434354838709677e-05, + "loss": 0.1959, + "step": 4008 + }, + { + "epoch": 0.064144, + "grad_norm": 0.6875, + "learning_rate": 9.434193548387097e-05, + "loss": 0.1526, + "step": 4009 + }, + { + "epoch": 0.06416, + "grad_norm": 1.1875, + "learning_rate": 9.434032258064517e-05, + "loss": 0.1479, + "step": 4010 + }, + { + "epoch": 0.064176, + "grad_norm": 0.62890625, + "learning_rate": 9.433870967741936e-05, + "loss": 0.1588, + "step": 4011 + }, + { + "epoch": 0.064192, + "grad_norm": 0.984375, + "learning_rate": 9.433709677419356e-05, + "loss": 0.1616, + "step": 4012 + }, + { + "epoch": 0.064208, + "grad_norm": 0.7578125, + "learning_rate": 9.433548387096775e-05, + "loss": 0.1768, + "step": 4013 + }, + { + "epoch": 0.064224, + "grad_norm": 1.0234375, + "learning_rate": 9.433387096774195e-05, + "loss": 0.2293, + "step": 4014 + }, + { + "epoch": 0.06424, + "grad_norm": 1.015625, + "learning_rate": 9.433225806451613e-05, + "loss": 0.2168, + "step": 4015 + }, + { + "epoch": 0.064256, + "grad_norm": 1.1875, + "learning_rate": 9.433064516129033e-05, + "loss": 0.1768, + "step": 4016 + }, + { + "epoch": 0.064272, + "grad_norm": 0.6328125, + "learning_rate": 9.432903225806452e-05, + "loss": 0.1752, + "step": 4017 + }, + { + "epoch": 0.064288, + "grad_norm": 0.640625, + "learning_rate": 9.432741935483872e-05, + "loss": 0.1727, + "step": 4018 + }, + { + "epoch": 0.064304, + "grad_norm": 0.79296875, + "learning_rate": 9.43258064516129e-05, + "loss": 0.2083, + "step": 4019 + }, + { + "epoch": 0.06432, + "grad_norm": 0.85546875, + "learning_rate": 9.43241935483871e-05, + "loss": 0.2386, + "step": 4020 + }, + { + "epoch": 0.064336, + "grad_norm": 0.98046875, + "learning_rate": 9.43225806451613e-05, + "loss": 0.2765, + "step": 4021 + }, + { + "epoch": 0.064352, + "grad_norm": 1.015625, + "learning_rate": 9.432096774193549e-05, + "loss": 0.171, + "step": 4022 + }, + { + "epoch": 0.064368, + "grad_norm": 0.80859375, + "learning_rate": 9.431935483870969e-05, + "loss": 0.1786, + "step": 4023 + }, + { + "epoch": 0.064384, + "grad_norm": 0.8125, + "learning_rate": 9.431774193548387e-05, + "loss": 0.1654, + "step": 4024 + }, + { + "epoch": 0.0644, + "grad_norm": 0.9375, + "learning_rate": 9.431612903225807e-05, + "loss": 0.2044, + "step": 4025 + }, + { + "epoch": 0.064416, + "grad_norm": 0.828125, + "learning_rate": 9.431451612903226e-05, + "loss": 0.2164, + "step": 4026 + }, + { + "epoch": 0.064432, + "grad_norm": 1.0859375, + "learning_rate": 9.431290322580646e-05, + "loss": 0.2256, + "step": 4027 + }, + { + "epoch": 0.064448, + "grad_norm": 0.76953125, + "learning_rate": 9.431129032258065e-05, + "loss": 0.1734, + "step": 4028 + }, + { + "epoch": 0.064464, + "grad_norm": 0.63671875, + "learning_rate": 9.430967741935484e-05, + "loss": 0.1774, + "step": 4029 + }, + { + "epoch": 0.06448, + "grad_norm": 0.6796875, + "learning_rate": 9.430806451612903e-05, + "loss": 0.1608, + "step": 4030 + }, + { + "epoch": 0.064496, + "grad_norm": 1.421875, + "learning_rate": 9.430645161290323e-05, + "loss": 0.1579, + "step": 4031 + }, + { + "epoch": 0.064512, + "grad_norm": 0.92578125, + "learning_rate": 9.430483870967743e-05, + "loss": 0.1942, + "step": 4032 + }, + { + "epoch": 0.064528, + "grad_norm": 0.70703125, + "learning_rate": 9.430322580645162e-05, + "loss": 0.125, + "step": 4033 + }, + { + "epoch": 0.064544, + "grad_norm": 0.890625, + "learning_rate": 9.430161290322582e-05, + "loss": 0.1678, + "step": 4034 + }, + { + "epoch": 0.06456, + "grad_norm": 0.7109375, + "learning_rate": 9.43e-05, + "loss": 0.1602, + "step": 4035 + }, + { + "epoch": 0.064576, + "grad_norm": 0.703125, + "learning_rate": 9.42983870967742e-05, + "loss": 0.1786, + "step": 4036 + }, + { + "epoch": 0.064592, + "grad_norm": 1.3515625, + "learning_rate": 9.429677419354839e-05, + "loss": 0.1634, + "step": 4037 + }, + { + "epoch": 0.064608, + "grad_norm": 0.7421875, + "learning_rate": 9.429516129032259e-05, + "loss": 0.1976, + "step": 4038 + }, + { + "epoch": 0.064624, + "grad_norm": 1.2265625, + "learning_rate": 9.429354838709677e-05, + "loss": 0.1673, + "step": 4039 + }, + { + "epoch": 0.06464, + "grad_norm": 0.7265625, + "learning_rate": 9.429193548387097e-05, + "loss": 0.1943, + "step": 4040 + }, + { + "epoch": 0.064656, + "grad_norm": 0.76953125, + "learning_rate": 9.429032258064516e-05, + "loss": 0.1874, + "step": 4041 + }, + { + "epoch": 0.064672, + "grad_norm": 0.703125, + "learning_rate": 9.428870967741936e-05, + "loss": 0.1781, + "step": 4042 + }, + { + "epoch": 0.064688, + "grad_norm": 0.6328125, + "learning_rate": 9.428709677419354e-05, + "loss": 0.1573, + "step": 4043 + }, + { + "epoch": 0.064704, + "grad_norm": 0.72265625, + "learning_rate": 9.428548387096774e-05, + "loss": 0.1659, + "step": 4044 + }, + { + "epoch": 0.06472, + "grad_norm": 0.68359375, + "learning_rate": 9.428387096774194e-05, + "loss": 0.1342, + "step": 4045 + }, + { + "epoch": 0.064736, + "grad_norm": 0.7734375, + "learning_rate": 9.428225806451614e-05, + "loss": 0.2052, + "step": 4046 + }, + { + "epoch": 0.064752, + "grad_norm": 0.78515625, + "learning_rate": 9.428064516129033e-05, + "loss": 0.2094, + "step": 4047 + }, + { + "epoch": 0.064768, + "grad_norm": 0.54296875, + "learning_rate": 9.427903225806453e-05, + "loss": 0.139, + "step": 4048 + }, + { + "epoch": 0.064784, + "grad_norm": 0.58984375, + "learning_rate": 9.427741935483872e-05, + "loss": 0.1759, + "step": 4049 + }, + { + "epoch": 0.0648, + "grad_norm": 0.7890625, + "learning_rate": 9.42758064516129e-05, + "loss": 0.1651, + "step": 4050 + }, + { + "epoch": 0.064816, + "grad_norm": 0.734375, + "learning_rate": 9.42741935483871e-05, + "loss": 0.1575, + "step": 4051 + }, + { + "epoch": 0.064832, + "grad_norm": 0.62890625, + "learning_rate": 9.427258064516129e-05, + "loss": 0.2137, + "step": 4052 + }, + { + "epoch": 0.064848, + "grad_norm": 1.1328125, + "learning_rate": 9.427096774193549e-05, + "loss": 0.1729, + "step": 4053 + }, + { + "epoch": 0.064864, + "grad_norm": 0.76171875, + "learning_rate": 9.426935483870967e-05, + "loss": 0.1628, + "step": 4054 + }, + { + "epoch": 0.06488, + "grad_norm": 0.9609375, + "learning_rate": 9.426774193548387e-05, + "loss": 0.1486, + "step": 4055 + }, + { + "epoch": 0.064896, + "grad_norm": 1.1484375, + "learning_rate": 9.426612903225807e-05, + "loss": 0.1639, + "step": 4056 + }, + { + "epoch": 0.064912, + "grad_norm": 1.1640625, + "learning_rate": 9.426451612903227e-05, + "loss": 0.1925, + "step": 4057 + }, + { + "epoch": 0.064928, + "grad_norm": 0.578125, + "learning_rate": 9.426290322580646e-05, + "loss": 0.155, + "step": 4058 + }, + { + "epoch": 0.064944, + "grad_norm": 0.8203125, + "learning_rate": 9.426129032258066e-05, + "loss": 0.1689, + "step": 4059 + }, + { + "epoch": 0.06496, + "grad_norm": 0.546875, + "learning_rate": 9.425967741935484e-05, + "loss": 0.1772, + "step": 4060 + }, + { + "epoch": 0.064976, + "grad_norm": 0.5546875, + "learning_rate": 9.425806451612904e-05, + "loss": 0.1427, + "step": 4061 + }, + { + "epoch": 0.064992, + "grad_norm": 0.75390625, + "learning_rate": 9.425645161290323e-05, + "loss": 0.1535, + "step": 4062 + }, + { + "epoch": 0.065008, + "grad_norm": 1.2109375, + "learning_rate": 9.425483870967743e-05, + "loss": 0.1907, + "step": 4063 + }, + { + "epoch": 0.065024, + "grad_norm": 0.79296875, + "learning_rate": 9.425322580645161e-05, + "loss": 0.1767, + "step": 4064 + }, + { + "epoch": 0.06504, + "grad_norm": 0.79296875, + "learning_rate": 9.42516129032258e-05, + "loss": 0.1755, + "step": 4065 + }, + { + "epoch": 0.065056, + "grad_norm": 0.796875, + "learning_rate": 9.425e-05, + "loss": 0.1739, + "step": 4066 + }, + { + "epoch": 0.065072, + "grad_norm": 0.91015625, + "learning_rate": 9.424838709677419e-05, + "loss": 0.2038, + "step": 4067 + }, + { + "epoch": 0.065088, + "grad_norm": 0.8515625, + "learning_rate": 9.424677419354839e-05, + "loss": 0.1887, + "step": 4068 + }, + { + "epoch": 0.065104, + "grad_norm": 1.1015625, + "learning_rate": 9.424516129032259e-05, + "loss": 0.2033, + "step": 4069 + }, + { + "epoch": 0.06512, + "grad_norm": 0.87109375, + "learning_rate": 9.424354838709679e-05, + "loss": 0.1882, + "step": 4070 + }, + { + "epoch": 0.065136, + "grad_norm": 0.765625, + "learning_rate": 9.424193548387097e-05, + "loss": 0.1828, + "step": 4071 + }, + { + "epoch": 0.065152, + "grad_norm": 0.9375, + "learning_rate": 9.424032258064517e-05, + "loss": 0.1584, + "step": 4072 + }, + { + "epoch": 0.065168, + "grad_norm": 0.86328125, + "learning_rate": 9.423870967741936e-05, + "loss": 0.178, + "step": 4073 + }, + { + "epoch": 0.065184, + "grad_norm": 0.7265625, + "learning_rate": 9.423709677419356e-05, + "loss": 0.1931, + "step": 4074 + }, + { + "epoch": 0.0652, + "grad_norm": 0.78515625, + "learning_rate": 9.423548387096774e-05, + "loss": 0.1882, + "step": 4075 + }, + { + "epoch": 0.065216, + "grad_norm": 0.84765625, + "learning_rate": 9.423387096774194e-05, + "loss": 0.1514, + "step": 4076 + }, + { + "epoch": 0.065232, + "grad_norm": 0.5390625, + "learning_rate": 9.423225806451613e-05, + "loss": 0.1693, + "step": 4077 + }, + { + "epoch": 0.065248, + "grad_norm": 0.8125, + "learning_rate": 9.423064516129033e-05, + "loss": 0.2341, + "step": 4078 + }, + { + "epoch": 0.065264, + "grad_norm": 0.60546875, + "learning_rate": 9.422903225806451e-05, + "loss": 0.1558, + "step": 4079 + }, + { + "epoch": 0.06528, + "grad_norm": 0.9921875, + "learning_rate": 9.422741935483871e-05, + "loss": 0.1552, + "step": 4080 + }, + { + "epoch": 0.065296, + "grad_norm": 1.0625, + "learning_rate": 9.422580645161291e-05, + "loss": 0.2009, + "step": 4081 + }, + { + "epoch": 0.065312, + "grad_norm": 1.046875, + "learning_rate": 9.42241935483871e-05, + "loss": 0.2308, + "step": 4082 + }, + { + "epoch": 0.065328, + "grad_norm": 0.6796875, + "learning_rate": 9.42225806451613e-05, + "loss": 0.1646, + "step": 4083 + }, + { + "epoch": 0.065344, + "grad_norm": 1.0625, + "learning_rate": 9.422096774193549e-05, + "loss": 0.1544, + "step": 4084 + }, + { + "epoch": 0.06536, + "grad_norm": 0.92578125, + "learning_rate": 9.421935483870969e-05, + "loss": 0.1583, + "step": 4085 + }, + { + "epoch": 0.065376, + "grad_norm": 0.69140625, + "learning_rate": 9.421774193548387e-05, + "loss": 0.1552, + "step": 4086 + }, + { + "epoch": 0.065392, + "grad_norm": 1.1953125, + "learning_rate": 9.421612903225807e-05, + "loss": 0.1872, + "step": 4087 + }, + { + "epoch": 0.065408, + "grad_norm": 1.4296875, + "learning_rate": 9.421451612903226e-05, + "loss": 0.1508, + "step": 4088 + }, + { + "epoch": 0.065424, + "grad_norm": 1.375, + "learning_rate": 9.421290322580646e-05, + "loss": 0.1732, + "step": 4089 + }, + { + "epoch": 0.06544, + "grad_norm": 0.71484375, + "learning_rate": 9.421129032258064e-05, + "loss": 0.2067, + "step": 4090 + }, + { + "epoch": 0.065456, + "grad_norm": 0.75390625, + "learning_rate": 9.420967741935484e-05, + "loss": 0.1686, + "step": 4091 + }, + { + "epoch": 0.065472, + "grad_norm": 0.86328125, + "learning_rate": 9.420806451612904e-05, + "loss": 0.2068, + "step": 4092 + }, + { + "epoch": 0.065488, + "grad_norm": 1.015625, + "learning_rate": 9.420645161290324e-05, + "loss": 0.1488, + "step": 4093 + }, + { + "epoch": 0.065504, + "grad_norm": 0.88671875, + "learning_rate": 9.420483870967743e-05, + "loss": 0.2164, + "step": 4094 + }, + { + "epoch": 0.06552, + "grad_norm": 0.84375, + "learning_rate": 9.420322580645163e-05, + "loss": 0.1711, + "step": 4095 + }, + { + "epoch": 0.065536, + "grad_norm": 1.0078125, + "learning_rate": 9.420161290322581e-05, + "loss": 0.153, + "step": 4096 + }, + { + "epoch": 0.065552, + "grad_norm": 1.5, + "learning_rate": 9.42e-05, + "loss": 0.2149, + "step": 4097 + }, + { + "epoch": 0.065568, + "grad_norm": 0.92578125, + "learning_rate": 9.41983870967742e-05, + "loss": 0.2173, + "step": 4098 + }, + { + "epoch": 0.065584, + "grad_norm": 1.375, + "learning_rate": 9.419677419354839e-05, + "loss": 0.1858, + "step": 4099 + }, + { + "epoch": 0.0656, + "grad_norm": 0.72265625, + "learning_rate": 9.419516129032258e-05, + "loss": 0.2045, + "step": 4100 + }, + { + "epoch": 0.065616, + "grad_norm": 0.94140625, + "learning_rate": 9.419354838709677e-05, + "loss": 0.1909, + "step": 4101 + }, + { + "epoch": 0.065632, + "grad_norm": 0.8515625, + "learning_rate": 9.419193548387097e-05, + "loss": 0.1801, + "step": 4102 + }, + { + "epoch": 0.065648, + "grad_norm": 0.60546875, + "learning_rate": 9.419032258064516e-05, + "loss": 0.1756, + "step": 4103 + }, + { + "epoch": 0.065664, + "grad_norm": 0.8515625, + "learning_rate": 9.418870967741936e-05, + "loss": 0.1851, + "step": 4104 + }, + { + "epoch": 0.06568, + "grad_norm": 1.6640625, + "learning_rate": 9.418709677419356e-05, + "loss": 0.2052, + "step": 4105 + }, + { + "epoch": 0.065696, + "grad_norm": 0.69921875, + "learning_rate": 9.418548387096776e-05, + "loss": 0.2012, + "step": 4106 + }, + { + "epoch": 0.065712, + "grad_norm": 1.1015625, + "learning_rate": 9.418387096774194e-05, + "loss": 0.2412, + "step": 4107 + }, + { + "epoch": 0.065728, + "grad_norm": 0.703125, + "learning_rate": 9.418225806451614e-05, + "loss": 0.1707, + "step": 4108 + }, + { + "epoch": 0.065744, + "grad_norm": 1.421875, + "learning_rate": 9.418064516129033e-05, + "loss": 0.1459, + "step": 4109 + }, + { + "epoch": 0.06576, + "grad_norm": 0.69921875, + "learning_rate": 9.417903225806453e-05, + "loss": 0.2188, + "step": 4110 + }, + { + "epoch": 0.065776, + "grad_norm": 1.109375, + "learning_rate": 9.417741935483871e-05, + "loss": 0.1689, + "step": 4111 + }, + { + "epoch": 0.065792, + "grad_norm": 1.359375, + "learning_rate": 9.41758064516129e-05, + "loss": 0.223, + "step": 4112 + }, + { + "epoch": 0.065808, + "grad_norm": 0.72265625, + "learning_rate": 9.41741935483871e-05, + "loss": 0.1777, + "step": 4113 + }, + { + "epoch": 0.065824, + "grad_norm": 0.78515625, + "learning_rate": 9.417258064516128e-05, + "loss": 0.1627, + "step": 4114 + }, + { + "epoch": 0.06584, + "grad_norm": 0.90234375, + "learning_rate": 9.417096774193548e-05, + "loss": 0.1886, + "step": 4115 + }, + { + "epoch": 0.065856, + "grad_norm": 1.1953125, + "learning_rate": 9.416935483870968e-05, + "loss": 0.1919, + "step": 4116 + }, + { + "epoch": 0.065872, + "grad_norm": 0.8671875, + "learning_rate": 9.416774193548388e-05, + "loss": 0.2483, + "step": 4117 + }, + { + "epoch": 0.065888, + "grad_norm": 0.58203125, + "learning_rate": 9.416612903225807e-05, + "loss": 0.1593, + "step": 4118 + }, + { + "epoch": 0.065904, + "grad_norm": 0.87109375, + "learning_rate": 9.416451612903227e-05, + "loss": 0.1582, + "step": 4119 + }, + { + "epoch": 0.06592, + "grad_norm": 0.6171875, + "learning_rate": 9.416290322580646e-05, + "loss": 0.1956, + "step": 4120 + }, + { + "epoch": 0.065936, + "grad_norm": 0.97265625, + "learning_rate": 9.416129032258065e-05, + "loss": 0.1811, + "step": 4121 + }, + { + "epoch": 0.065952, + "grad_norm": 0.66015625, + "learning_rate": 9.415967741935484e-05, + "loss": 0.1431, + "step": 4122 + }, + { + "epoch": 0.065968, + "grad_norm": 0.99609375, + "learning_rate": 9.415806451612904e-05, + "loss": 0.2013, + "step": 4123 + }, + { + "epoch": 0.065984, + "grad_norm": 0.78515625, + "learning_rate": 9.415645161290323e-05, + "loss": 0.1959, + "step": 4124 + }, + { + "epoch": 0.066, + "grad_norm": 1.3515625, + "learning_rate": 9.415483870967743e-05, + "loss": 0.1821, + "step": 4125 + }, + { + "epoch": 0.066016, + "grad_norm": 0.9765625, + "learning_rate": 9.415322580645161e-05, + "loss": 0.1872, + "step": 4126 + }, + { + "epoch": 0.066032, + "grad_norm": 1.2734375, + "learning_rate": 9.415161290322581e-05, + "loss": 0.1742, + "step": 4127 + }, + { + "epoch": 0.066048, + "grad_norm": 0.859375, + "learning_rate": 9.415e-05, + "loss": 0.1718, + "step": 4128 + }, + { + "epoch": 0.066064, + "grad_norm": 0.8125, + "learning_rate": 9.41483870967742e-05, + "loss": 0.179, + "step": 4129 + }, + { + "epoch": 0.06608, + "grad_norm": 1.046875, + "learning_rate": 9.41467741935484e-05, + "loss": 0.1998, + "step": 4130 + }, + { + "epoch": 0.066096, + "grad_norm": 0.8046875, + "learning_rate": 9.414516129032258e-05, + "loss": 0.1861, + "step": 4131 + }, + { + "epoch": 0.066112, + "grad_norm": 0.73046875, + "learning_rate": 9.414354838709678e-05, + "loss": 0.1546, + "step": 4132 + }, + { + "epoch": 0.066128, + "grad_norm": 0.91015625, + "learning_rate": 9.414193548387097e-05, + "loss": 0.186, + "step": 4133 + }, + { + "epoch": 0.066144, + "grad_norm": 0.859375, + "learning_rate": 9.414032258064517e-05, + "loss": 0.1694, + "step": 4134 + }, + { + "epoch": 0.06616, + "grad_norm": 0.97265625, + "learning_rate": 9.413870967741935e-05, + "loss": 0.1826, + "step": 4135 + }, + { + "epoch": 0.066176, + "grad_norm": 1.078125, + "learning_rate": 9.413709677419355e-05, + "loss": 0.1538, + "step": 4136 + }, + { + "epoch": 0.066192, + "grad_norm": 1.2109375, + "learning_rate": 9.413548387096774e-05, + "loss": 0.2155, + "step": 4137 + }, + { + "epoch": 0.066208, + "grad_norm": 0.94140625, + "learning_rate": 9.413387096774194e-05, + "loss": 0.2216, + "step": 4138 + }, + { + "epoch": 0.066224, + "grad_norm": 0.91796875, + "learning_rate": 9.413225806451613e-05, + "loss": 0.1704, + "step": 4139 + }, + { + "epoch": 0.06624, + "grad_norm": 0.71875, + "learning_rate": 9.413064516129033e-05, + "loss": 0.2068, + "step": 4140 + }, + { + "epoch": 0.066256, + "grad_norm": 0.81640625, + "learning_rate": 9.412903225806453e-05, + "loss": 0.185, + "step": 4141 + }, + { + "epoch": 0.066272, + "grad_norm": 0.94140625, + "learning_rate": 9.412741935483873e-05, + "loss": 0.1907, + "step": 4142 + }, + { + "epoch": 0.066288, + "grad_norm": 1.3125, + "learning_rate": 9.412580645161291e-05, + "loss": 0.2148, + "step": 4143 + }, + { + "epoch": 0.066304, + "grad_norm": 0.75, + "learning_rate": 9.41241935483871e-05, + "loss": 0.1775, + "step": 4144 + }, + { + "epoch": 0.06632, + "grad_norm": 0.69140625, + "learning_rate": 9.41225806451613e-05, + "loss": 0.1533, + "step": 4145 + }, + { + "epoch": 0.066336, + "grad_norm": 1.0078125, + "learning_rate": 9.412096774193548e-05, + "loss": 0.1927, + "step": 4146 + }, + { + "epoch": 0.066352, + "grad_norm": 1.609375, + "learning_rate": 9.411935483870968e-05, + "loss": 0.1853, + "step": 4147 + }, + { + "epoch": 0.066368, + "grad_norm": 1.7109375, + "learning_rate": 9.411774193548387e-05, + "loss": 0.198, + "step": 4148 + }, + { + "epoch": 0.066384, + "grad_norm": 1.0078125, + "learning_rate": 9.411612903225807e-05, + "loss": 0.1435, + "step": 4149 + }, + { + "epoch": 0.0664, + "grad_norm": 0.9375, + "learning_rate": 9.411451612903225e-05, + "loss": 0.2319, + "step": 4150 + }, + { + "epoch": 0.066416, + "grad_norm": 0.75390625, + "learning_rate": 9.411290322580645e-05, + "loss": 0.1678, + "step": 4151 + }, + { + "epoch": 0.066432, + "grad_norm": 0.64453125, + "learning_rate": 9.411129032258065e-05, + "loss": 0.1645, + "step": 4152 + }, + { + "epoch": 0.066448, + "grad_norm": 1.7578125, + "learning_rate": 9.410967741935485e-05, + "loss": 0.1772, + "step": 4153 + }, + { + "epoch": 0.066464, + "grad_norm": 1.0625, + "learning_rate": 9.410806451612904e-05, + "loss": 0.1952, + "step": 4154 + }, + { + "epoch": 0.06648, + "grad_norm": 0.859375, + "learning_rate": 9.410645161290324e-05, + "loss": 0.1787, + "step": 4155 + }, + { + "epoch": 0.066496, + "grad_norm": 1.0546875, + "learning_rate": 9.410483870967743e-05, + "loss": 0.1878, + "step": 4156 + }, + { + "epoch": 0.066512, + "grad_norm": 0.85546875, + "learning_rate": 9.410322580645162e-05, + "loss": 0.1614, + "step": 4157 + }, + { + "epoch": 0.066528, + "grad_norm": 0.9140625, + "learning_rate": 9.410161290322581e-05, + "loss": 0.2038, + "step": 4158 + }, + { + "epoch": 0.066544, + "grad_norm": 0.9765625, + "learning_rate": 9.41e-05, + "loss": 0.1834, + "step": 4159 + }, + { + "epoch": 0.06656, + "grad_norm": 0.625, + "learning_rate": 9.40983870967742e-05, + "loss": 0.1759, + "step": 4160 + }, + { + "epoch": 0.066576, + "grad_norm": 0.73828125, + "learning_rate": 9.409677419354838e-05, + "loss": 0.1912, + "step": 4161 + }, + { + "epoch": 0.066592, + "grad_norm": 0.7734375, + "learning_rate": 9.409516129032258e-05, + "loss": 0.1753, + "step": 4162 + }, + { + "epoch": 0.066608, + "grad_norm": 0.625, + "learning_rate": 9.409354838709677e-05, + "loss": 0.188, + "step": 4163 + }, + { + "epoch": 0.066624, + "grad_norm": 0.6171875, + "learning_rate": 9.409193548387097e-05, + "loss": 0.1973, + "step": 4164 + }, + { + "epoch": 0.06664, + "grad_norm": 0.88671875, + "learning_rate": 9.409032258064517e-05, + "loss": 0.1864, + "step": 4165 + }, + { + "epoch": 0.066656, + "grad_norm": 0.89453125, + "learning_rate": 9.408870967741937e-05, + "loss": 0.2044, + "step": 4166 + }, + { + "epoch": 0.066672, + "grad_norm": 0.65625, + "learning_rate": 9.408709677419355e-05, + "loss": 0.1737, + "step": 4167 + }, + { + "epoch": 0.066688, + "grad_norm": 1.5078125, + "learning_rate": 9.408548387096775e-05, + "loss": 0.182, + "step": 4168 + }, + { + "epoch": 0.066704, + "grad_norm": 0.71875, + "learning_rate": 9.408387096774194e-05, + "loss": 0.186, + "step": 4169 + }, + { + "epoch": 0.06672, + "grad_norm": 1.4375, + "learning_rate": 9.408225806451614e-05, + "loss": 0.2534, + "step": 4170 + }, + { + "epoch": 0.066736, + "grad_norm": 1.6328125, + "learning_rate": 9.408064516129032e-05, + "loss": 0.2023, + "step": 4171 + }, + { + "epoch": 0.066752, + "grad_norm": 0.609375, + "learning_rate": 9.407903225806452e-05, + "loss": 0.1629, + "step": 4172 + }, + { + "epoch": 0.066768, + "grad_norm": 0.71484375, + "learning_rate": 9.407741935483871e-05, + "loss": 0.189, + "step": 4173 + }, + { + "epoch": 0.066784, + "grad_norm": 0.84765625, + "learning_rate": 9.40758064516129e-05, + "loss": 0.1662, + "step": 4174 + }, + { + "epoch": 0.0668, + "grad_norm": 0.67578125, + "learning_rate": 9.40741935483871e-05, + "loss": 0.1726, + "step": 4175 + }, + { + "epoch": 0.066816, + "grad_norm": 0.69921875, + "learning_rate": 9.40725806451613e-05, + "loss": 0.1781, + "step": 4176 + }, + { + "epoch": 0.066832, + "grad_norm": 1.609375, + "learning_rate": 9.40709677419355e-05, + "loss": 0.1474, + "step": 4177 + }, + { + "epoch": 0.066848, + "grad_norm": 0.92578125, + "learning_rate": 9.406935483870968e-05, + "loss": 0.1916, + "step": 4178 + }, + { + "epoch": 0.066864, + "grad_norm": 0.8828125, + "learning_rate": 9.406774193548388e-05, + "loss": 0.2129, + "step": 4179 + }, + { + "epoch": 0.06688, + "grad_norm": 1.1796875, + "learning_rate": 9.406612903225807e-05, + "loss": 0.1796, + "step": 4180 + }, + { + "epoch": 0.066896, + "grad_norm": 0.921875, + "learning_rate": 9.406451612903227e-05, + "loss": 0.196, + "step": 4181 + }, + { + "epoch": 0.066912, + "grad_norm": 1.4375, + "learning_rate": 9.406290322580645e-05, + "loss": 0.2385, + "step": 4182 + }, + { + "epoch": 0.066928, + "grad_norm": 0.6953125, + "learning_rate": 9.406129032258065e-05, + "loss": 0.1476, + "step": 4183 + }, + { + "epoch": 0.066944, + "grad_norm": 0.82421875, + "learning_rate": 9.405967741935484e-05, + "loss": 0.1873, + "step": 4184 + }, + { + "epoch": 0.06696, + "grad_norm": 1.125, + "learning_rate": 9.405806451612904e-05, + "loss": 0.1902, + "step": 4185 + }, + { + "epoch": 0.066976, + "grad_norm": 0.68359375, + "learning_rate": 9.405645161290322e-05, + "loss": 0.1416, + "step": 4186 + }, + { + "epoch": 0.066992, + "grad_norm": 1.3046875, + "learning_rate": 9.405483870967742e-05, + "loss": 0.1737, + "step": 4187 + }, + { + "epoch": 0.067008, + "grad_norm": 0.98046875, + "learning_rate": 9.405322580645162e-05, + "loss": 0.2056, + "step": 4188 + }, + { + "epoch": 0.067024, + "grad_norm": 1.1015625, + "learning_rate": 9.405161290322582e-05, + "loss": 0.1641, + "step": 4189 + }, + { + "epoch": 0.06704, + "grad_norm": 1.5546875, + "learning_rate": 9.405000000000001e-05, + "loss": 0.2196, + "step": 4190 + }, + { + "epoch": 0.067056, + "grad_norm": 1.265625, + "learning_rate": 9.40483870967742e-05, + "loss": 0.2038, + "step": 4191 + }, + { + "epoch": 0.067072, + "grad_norm": 0.671875, + "learning_rate": 9.40467741935484e-05, + "loss": 0.1876, + "step": 4192 + }, + { + "epoch": 0.067088, + "grad_norm": 0.953125, + "learning_rate": 9.404516129032258e-05, + "loss": 0.1757, + "step": 4193 + }, + { + "epoch": 0.067104, + "grad_norm": 0.6328125, + "learning_rate": 9.404354838709678e-05, + "loss": 0.1511, + "step": 4194 + }, + { + "epoch": 0.06712, + "grad_norm": 1.0859375, + "learning_rate": 9.404193548387097e-05, + "loss": 0.1925, + "step": 4195 + }, + { + "epoch": 0.067136, + "grad_norm": 1.2890625, + "learning_rate": 9.404032258064517e-05, + "loss": 0.1638, + "step": 4196 + }, + { + "epoch": 0.067152, + "grad_norm": 0.71484375, + "learning_rate": 9.403870967741935e-05, + "loss": 0.1706, + "step": 4197 + }, + { + "epoch": 0.067168, + "grad_norm": 0.6953125, + "learning_rate": 9.403709677419355e-05, + "loss": 0.1821, + "step": 4198 + }, + { + "epoch": 0.067184, + "grad_norm": 0.49609375, + "learning_rate": 9.403548387096774e-05, + "loss": 0.1522, + "step": 4199 + }, + { + "epoch": 0.0672, + "grad_norm": 0.59765625, + "learning_rate": 9.403387096774194e-05, + "loss": 0.1742, + "step": 4200 + }, + { + "epoch": 0.067216, + "grad_norm": 0.828125, + "learning_rate": 9.403225806451614e-05, + "loss": 0.1639, + "step": 4201 + }, + { + "epoch": 0.067232, + "grad_norm": 0.75390625, + "learning_rate": 9.403064516129034e-05, + "loss": 0.1909, + "step": 4202 + }, + { + "epoch": 0.067248, + "grad_norm": 1.1796875, + "learning_rate": 9.402903225806452e-05, + "loss": 0.235, + "step": 4203 + }, + { + "epoch": 0.067264, + "grad_norm": 1.3125, + "learning_rate": 9.402741935483872e-05, + "loss": 0.1739, + "step": 4204 + }, + { + "epoch": 0.06728, + "grad_norm": 1.0078125, + "learning_rate": 9.402580645161291e-05, + "loss": 0.1686, + "step": 4205 + }, + { + "epoch": 0.067296, + "grad_norm": 0.69140625, + "learning_rate": 9.40241935483871e-05, + "loss": 0.1854, + "step": 4206 + }, + { + "epoch": 0.067312, + "grad_norm": 0.96484375, + "learning_rate": 9.40225806451613e-05, + "loss": 0.1897, + "step": 4207 + }, + { + "epoch": 0.067328, + "grad_norm": 1.0390625, + "learning_rate": 9.402096774193548e-05, + "loss": 0.2003, + "step": 4208 + }, + { + "epoch": 0.067344, + "grad_norm": 0.578125, + "learning_rate": 9.401935483870968e-05, + "loss": 0.1636, + "step": 4209 + }, + { + "epoch": 0.06736, + "grad_norm": 1.0390625, + "learning_rate": 9.401774193548387e-05, + "loss": 0.2688, + "step": 4210 + }, + { + "epoch": 0.067376, + "grad_norm": 0.98828125, + "learning_rate": 9.401612903225807e-05, + "loss": 0.211, + "step": 4211 + }, + { + "epoch": 0.067392, + "grad_norm": 0.828125, + "learning_rate": 9.401451612903227e-05, + "loss": 0.2069, + "step": 4212 + }, + { + "epoch": 0.067408, + "grad_norm": 0.73046875, + "learning_rate": 9.401290322580647e-05, + "loss": 0.2139, + "step": 4213 + }, + { + "epoch": 0.067424, + "grad_norm": 0.96875, + "learning_rate": 9.401129032258065e-05, + "loss": 0.151, + "step": 4214 + }, + { + "epoch": 0.06744, + "grad_norm": 1.4296875, + "learning_rate": 9.400967741935485e-05, + "loss": 0.2013, + "step": 4215 + }, + { + "epoch": 0.067456, + "grad_norm": 0.89453125, + "learning_rate": 9.400806451612904e-05, + "loss": 0.2164, + "step": 4216 + }, + { + "epoch": 0.067472, + "grad_norm": 0.84765625, + "learning_rate": 9.400645161290324e-05, + "loss": 0.1923, + "step": 4217 + }, + { + "epoch": 0.067488, + "grad_norm": 1.078125, + "learning_rate": 9.400483870967742e-05, + "loss": 0.1984, + "step": 4218 + }, + { + "epoch": 0.067504, + "grad_norm": 0.83203125, + "learning_rate": 9.400322580645162e-05, + "loss": 0.1762, + "step": 4219 + }, + { + "epoch": 0.06752, + "grad_norm": 0.7421875, + "learning_rate": 9.400161290322581e-05, + "loss": 0.1926, + "step": 4220 + }, + { + "epoch": 0.067536, + "grad_norm": 0.703125, + "learning_rate": 9.4e-05, + "loss": 0.1439, + "step": 4221 + }, + { + "epoch": 0.067552, + "grad_norm": 1.453125, + "learning_rate": 9.39983870967742e-05, + "loss": 0.2253, + "step": 4222 + }, + { + "epoch": 0.067568, + "grad_norm": 1.34375, + "learning_rate": 9.39967741935484e-05, + "loss": 0.1774, + "step": 4223 + }, + { + "epoch": 0.067584, + "grad_norm": 1.15625, + "learning_rate": 9.399516129032258e-05, + "loss": 0.1676, + "step": 4224 + }, + { + "epoch": 0.0676, + "grad_norm": 0.9921875, + "learning_rate": 9.399354838709678e-05, + "loss": 0.1971, + "step": 4225 + }, + { + "epoch": 0.067616, + "grad_norm": 0.80859375, + "learning_rate": 9.399193548387098e-05, + "loss": 0.2343, + "step": 4226 + }, + { + "epoch": 0.067632, + "grad_norm": 1.0234375, + "learning_rate": 9.399032258064517e-05, + "loss": 0.1926, + "step": 4227 + }, + { + "epoch": 0.067648, + "grad_norm": 1.0078125, + "learning_rate": 9.398870967741936e-05, + "loss": 0.1937, + "step": 4228 + }, + { + "epoch": 0.067664, + "grad_norm": 1.0859375, + "learning_rate": 9.398709677419355e-05, + "loss": 0.1513, + "step": 4229 + }, + { + "epoch": 0.06768, + "grad_norm": 0.875, + "learning_rate": 9.398548387096775e-05, + "loss": 0.1704, + "step": 4230 + }, + { + "epoch": 0.067696, + "grad_norm": 2.015625, + "learning_rate": 9.398387096774194e-05, + "loss": 0.19, + "step": 4231 + }, + { + "epoch": 0.067712, + "grad_norm": 0.80859375, + "learning_rate": 9.398225806451614e-05, + "loss": 0.1966, + "step": 4232 + }, + { + "epoch": 0.067728, + "grad_norm": 0.94921875, + "learning_rate": 9.398064516129032e-05, + "loss": 0.1754, + "step": 4233 + }, + { + "epoch": 0.067744, + "grad_norm": 0.69921875, + "learning_rate": 9.397903225806452e-05, + "loss": 0.1651, + "step": 4234 + }, + { + "epoch": 0.06776, + "grad_norm": 1.25, + "learning_rate": 9.397741935483871e-05, + "loss": 0.1682, + "step": 4235 + }, + { + "epoch": 0.067776, + "grad_norm": 0.84765625, + "learning_rate": 9.397580645161291e-05, + "loss": 0.1981, + "step": 4236 + }, + { + "epoch": 0.067792, + "grad_norm": 0.52734375, + "learning_rate": 9.397419354838711e-05, + "loss": 0.1399, + "step": 4237 + }, + { + "epoch": 0.067808, + "grad_norm": 0.875, + "learning_rate": 9.39725806451613e-05, + "loss": 0.1939, + "step": 4238 + }, + { + "epoch": 0.067824, + "grad_norm": 1.3046875, + "learning_rate": 9.397096774193549e-05, + "loss": 0.1714, + "step": 4239 + }, + { + "epoch": 0.06784, + "grad_norm": 0.92578125, + "learning_rate": 9.396935483870968e-05, + "loss": 0.2062, + "step": 4240 + }, + { + "epoch": 0.067856, + "grad_norm": 1.1640625, + "learning_rate": 9.396774193548388e-05, + "loss": 0.1985, + "step": 4241 + }, + { + "epoch": 0.067872, + "grad_norm": 0.75390625, + "learning_rate": 9.396612903225806e-05, + "loss": 0.1766, + "step": 4242 + }, + { + "epoch": 0.067888, + "grad_norm": 0.9765625, + "learning_rate": 9.396451612903226e-05, + "loss": 0.1911, + "step": 4243 + }, + { + "epoch": 0.067904, + "grad_norm": 0.94140625, + "learning_rate": 9.396290322580645e-05, + "loss": 0.1815, + "step": 4244 + }, + { + "epoch": 0.06792, + "grad_norm": 0.83203125, + "learning_rate": 9.396129032258065e-05, + "loss": 0.1987, + "step": 4245 + }, + { + "epoch": 0.067936, + "grad_norm": 0.75390625, + "learning_rate": 9.395967741935484e-05, + "loss": 0.1898, + "step": 4246 + }, + { + "epoch": 0.067952, + "grad_norm": 0.66015625, + "learning_rate": 9.395806451612904e-05, + "loss": 0.2021, + "step": 4247 + }, + { + "epoch": 0.067968, + "grad_norm": 0.71484375, + "learning_rate": 9.395645161290324e-05, + "loss": 0.206, + "step": 4248 + }, + { + "epoch": 0.067984, + "grad_norm": 0.71484375, + "learning_rate": 9.395483870967744e-05, + "loss": 0.1671, + "step": 4249 + }, + { + "epoch": 0.068, + "grad_norm": 1.0390625, + "learning_rate": 9.395322580645162e-05, + "loss": 0.211, + "step": 4250 + }, + { + "epoch": 0.068016, + "grad_norm": 0.8671875, + "learning_rate": 9.395161290322582e-05, + "loss": 0.2425, + "step": 4251 + }, + { + "epoch": 0.068032, + "grad_norm": 0.84375, + "learning_rate": 9.395000000000001e-05, + "loss": 0.2346, + "step": 4252 + }, + { + "epoch": 0.068048, + "grad_norm": 0.68359375, + "learning_rate": 9.394838709677419e-05, + "loss": 0.1733, + "step": 4253 + }, + { + "epoch": 0.068064, + "grad_norm": 0.796875, + "learning_rate": 9.394677419354839e-05, + "loss": 0.1626, + "step": 4254 + }, + { + "epoch": 0.06808, + "grad_norm": 1.1015625, + "learning_rate": 9.394516129032258e-05, + "loss": 0.1445, + "step": 4255 + }, + { + "epoch": 0.068096, + "grad_norm": 1.015625, + "learning_rate": 9.394354838709678e-05, + "loss": 0.2092, + "step": 4256 + }, + { + "epoch": 0.068112, + "grad_norm": 0.75390625, + "learning_rate": 9.394193548387096e-05, + "loss": 0.1582, + "step": 4257 + }, + { + "epoch": 0.068128, + "grad_norm": 0.80859375, + "learning_rate": 9.394032258064516e-05, + "loss": 0.1734, + "step": 4258 + }, + { + "epoch": 0.068144, + "grad_norm": 0.88671875, + "learning_rate": 9.393870967741935e-05, + "loss": 0.1612, + "step": 4259 + }, + { + "epoch": 0.06816, + "grad_norm": 0.65234375, + "learning_rate": 9.393709677419355e-05, + "loss": 0.164, + "step": 4260 + }, + { + "epoch": 0.068176, + "grad_norm": 1.125, + "learning_rate": 9.393548387096775e-05, + "loss": 0.1962, + "step": 4261 + }, + { + "epoch": 0.068192, + "grad_norm": 1.0390625, + "learning_rate": 9.393387096774195e-05, + "loss": 0.1593, + "step": 4262 + }, + { + "epoch": 0.068208, + "grad_norm": 0.5625, + "learning_rate": 9.393225806451613e-05, + "loss": 0.1533, + "step": 4263 + }, + { + "epoch": 0.068224, + "grad_norm": 2.265625, + "learning_rate": 9.393064516129033e-05, + "loss": 0.1683, + "step": 4264 + }, + { + "epoch": 0.06824, + "grad_norm": 1.3984375, + "learning_rate": 9.392903225806452e-05, + "loss": 0.1768, + "step": 4265 + }, + { + "epoch": 0.068256, + "grad_norm": 1.1875, + "learning_rate": 9.392741935483872e-05, + "loss": 0.1576, + "step": 4266 + }, + { + "epoch": 0.068272, + "grad_norm": 0.734375, + "learning_rate": 9.39258064516129e-05, + "loss": 0.2049, + "step": 4267 + }, + { + "epoch": 0.068288, + "grad_norm": 0.77734375, + "learning_rate": 9.392419354838709e-05, + "loss": 0.164, + "step": 4268 + }, + { + "epoch": 0.068304, + "grad_norm": 0.734375, + "learning_rate": 9.392258064516129e-05, + "loss": 0.1873, + "step": 4269 + }, + { + "epoch": 0.06832, + "grad_norm": 1.6796875, + "learning_rate": 9.392096774193548e-05, + "loss": 0.1822, + "step": 4270 + }, + { + "epoch": 0.068336, + "grad_norm": 1.2890625, + "learning_rate": 9.391935483870968e-05, + "loss": 0.1529, + "step": 4271 + }, + { + "epoch": 0.068352, + "grad_norm": 0.69140625, + "learning_rate": 9.391774193548388e-05, + "loss": 0.1803, + "step": 4272 + }, + { + "epoch": 0.068368, + "grad_norm": 1.6171875, + "learning_rate": 9.391612903225808e-05, + "loss": 0.2703, + "step": 4273 + }, + { + "epoch": 0.068384, + "grad_norm": 0.609375, + "learning_rate": 9.391451612903226e-05, + "loss": 0.1338, + "step": 4274 + }, + { + "epoch": 0.0684, + "grad_norm": 1.078125, + "learning_rate": 9.391290322580646e-05, + "loss": 0.1558, + "step": 4275 + }, + { + "epoch": 0.068416, + "grad_norm": 0.87109375, + "learning_rate": 9.391129032258065e-05, + "loss": 0.1996, + "step": 4276 + }, + { + "epoch": 0.068432, + "grad_norm": 2.078125, + "learning_rate": 9.390967741935485e-05, + "loss": 0.2086, + "step": 4277 + }, + { + "epoch": 0.068448, + "grad_norm": 0.84375, + "learning_rate": 9.390806451612903e-05, + "loss": 0.1555, + "step": 4278 + }, + { + "epoch": 0.068464, + "grad_norm": 0.8046875, + "learning_rate": 9.390645161290323e-05, + "loss": 0.1524, + "step": 4279 + }, + { + "epoch": 0.06848, + "grad_norm": 1.28125, + "learning_rate": 9.390483870967742e-05, + "loss": 0.1961, + "step": 4280 + }, + { + "epoch": 0.068496, + "grad_norm": 0.91015625, + "learning_rate": 9.390322580645162e-05, + "loss": 0.1545, + "step": 4281 + }, + { + "epoch": 0.068512, + "grad_norm": 2.09375, + "learning_rate": 9.39016129032258e-05, + "loss": 0.159, + "step": 4282 + }, + { + "epoch": 0.068528, + "grad_norm": 0.8671875, + "learning_rate": 9.39e-05, + "loss": 0.1873, + "step": 4283 + }, + { + "epoch": 0.068544, + "grad_norm": 0.87109375, + "learning_rate": 9.38983870967742e-05, + "loss": 0.1698, + "step": 4284 + }, + { + "epoch": 0.06856, + "grad_norm": 0.97265625, + "learning_rate": 9.389677419354839e-05, + "loss": 0.1749, + "step": 4285 + }, + { + "epoch": 0.068576, + "grad_norm": 0.671875, + "learning_rate": 9.389516129032259e-05, + "loss": 0.2094, + "step": 4286 + }, + { + "epoch": 0.068592, + "grad_norm": 0.8125, + "learning_rate": 9.389354838709678e-05, + "loss": 0.16, + "step": 4287 + }, + { + "epoch": 0.068608, + "grad_norm": 0.80859375, + "learning_rate": 9.389193548387098e-05, + "loss": 0.1802, + "step": 4288 + }, + { + "epoch": 0.068624, + "grad_norm": 0.625, + "learning_rate": 9.389032258064516e-05, + "loss": 0.1427, + "step": 4289 + }, + { + "epoch": 0.06864, + "grad_norm": 0.73046875, + "learning_rate": 9.388870967741936e-05, + "loss": 0.1872, + "step": 4290 + }, + { + "epoch": 0.068656, + "grad_norm": 0.99609375, + "learning_rate": 9.388709677419355e-05, + "loss": 0.1894, + "step": 4291 + }, + { + "epoch": 0.068672, + "grad_norm": 0.68359375, + "learning_rate": 9.388548387096775e-05, + "loss": 0.2049, + "step": 4292 + }, + { + "epoch": 0.068688, + "grad_norm": 1.015625, + "learning_rate": 9.388387096774193e-05, + "loss": 0.1657, + "step": 4293 + }, + { + "epoch": 0.068704, + "grad_norm": 0.9453125, + "learning_rate": 9.388225806451613e-05, + "loss": 0.1632, + "step": 4294 + }, + { + "epoch": 0.06872, + "grad_norm": 0.97265625, + "learning_rate": 9.388064516129032e-05, + "loss": 0.1935, + "step": 4295 + }, + { + "epoch": 0.068736, + "grad_norm": 1.0, + "learning_rate": 9.387903225806452e-05, + "loss": 0.1591, + "step": 4296 + }, + { + "epoch": 0.068752, + "grad_norm": 0.89453125, + "learning_rate": 9.387741935483872e-05, + "loss": 0.1721, + "step": 4297 + }, + { + "epoch": 0.068768, + "grad_norm": 1.1171875, + "learning_rate": 9.387580645161292e-05, + "loss": 0.1875, + "step": 4298 + }, + { + "epoch": 0.068784, + "grad_norm": 0.78125, + "learning_rate": 9.38741935483871e-05, + "loss": 0.1667, + "step": 4299 + }, + { + "epoch": 0.0688, + "grad_norm": 0.5390625, + "learning_rate": 9.387258064516129e-05, + "loss": 0.1645, + "step": 4300 + }, + { + "epoch": 0.068816, + "grad_norm": 0.82421875, + "learning_rate": 9.387096774193549e-05, + "loss": 0.1597, + "step": 4301 + }, + { + "epoch": 0.068832, + "grad_norm": 1.5390625, + "learning_rate": 9.386935483870968e-05, + "loss": 0.1747, + "step": 4302 + }, + { + "epoch": 0.068848, + "grad_norm": 0.55078125, + "learning_rate": 9.386774193548388e-05, + "loss": 0.1441, + "step": 4303 + }, + { + "epoch": 0.068864, + "grad_norm": 0.92578125, + "learning_rate": 9.386612903225806e-05, + "loss": 0.1817, + "step": 4304 + }, + { + "epoch": 0.06888, + "grad_norm": 0.81640625, + "learning_rate": 9.386451612903226e-05, + "loss": 0.1505, + "step": 4305 + }, + { + "epoch": 0.068896, + "grad_norm": 0.6171875, + "learning_rate": 9.386290322580645e-05, + "loss": 0.1634, + "step": 4306 + }, + { + "epoch": 0.068912, + "grad_norm": 1.09375, + "learning_rate": 9.386129032258065e-05, + "loss": 0.2073, + "step": 4307 + }, + { + "epoch": 0.068928, + "grad_norm": 0.765625, + "learning_rate": 9.385967741935485e-05, + "loss": 0.2143, + "step": 4308 + }, + { + "epoch": 0.068944, + "grad_norm": 1.15625, + "learning_rate": 9.385806451612905e-05, + "loss": 0.1778, + "step": 4309 + }, + { + "epoch": 0.06896, + "grad_norm": 0.8359375, + "learning_rate": 9.385645161290323e-05, + "loss": 0.1904, + "step": 4310 + }, + { + "epoch": 0.068976, + "grad_norm": 0.5546875, + "learning_rate": 9.385483870967743e-05, + "loss": 0.1375, + "step": 4311 + }, + { + "epoch": 0.068992, + "grad_norm": 0.796875, + "learning_rate": 9.385322580645162e-05, + "loss": 0.1612, + "step": 4312 + }, + { + "epoch": 0.069008, + "grad_norm": 1.0390625, + "learning_rate": 9.385161290322582e-05, + "loss": 0.2173, + "step": 4313 + }, + { + "epoch": 0.069024, + "grad_norm": 0.6171875, + "learning_rate": 9.385e-05, + "loss": 0.1821, + "step": 4314 + }, + { + "epoch": 0.06904, + "grad_norm": 1.328125, + "learning_rate": 9.384838709677419e-05, + "loss": 0.2108, + "step": 4315 + }, + { + "epoch": 0.069056, + "grad_norm": 0.88671875, + "learning_rate": 9.384677419354839e-05, + "loss": 0.1485, + "step": 4316 + }, + { + "epoch": 0.069072, + "grad_norm": 0.95703125, + "learning_rate": 9.384516129032258e-05, + "loss": 0.187, + "step": 4317 + }, + { + "epoch": 0.069088, + "grad_norm": 0.8984375, + "learning_rate": 9.384354838709678e-05, + "loss": 0.1694, + "step": 4318 + }, + { + "epoch": 0.069104, + "grad_norm": 0.9609375, + "learning_rate": 9.384193548387096e-05, + "loss": 0.1653, + "step": 4319 + }, + { + "epoch": 0.06912, + "grad_norm": 0.91796875, + "learning_rate": 9.384032258064516e-05, + "loss": 0.1897, + "step": 4320 + }, + { + "epoch": 0.069136, + "grad_norm": 0.88671875, + "learning_rate": 9.383870967741936e-05, + "loss": 0.1778, + "step": 4321 + }, + { + "epoch": 0.069152, + "grad_norm": 0.90625, + "learning_rate": 9.383709677419356e-05, + "loss": 0.1981, + "step": 4322 + }, + { + "epoch": 0.069168, + "grad_norm": 0.8203125, + "learning_rate": 9.383548387096775e-05, + "loss": 0.1785, + "step": 4323 + }, + { + "epoch": 0.069184, + "grad_norm": 0.85546875, + "learning_rate": 9.383387096774195e-05, + "loss": 0.1642, + "step": 4324 + }, + { + "epoch": 0.0692, + "grad_norm": 0.82421875, + "learning_rate": 9.383225806451613e-05, + "loss": 0.1858, + "step": 4325 + }, + { + "epoch": 0.069216, + "grad_norm": 1.0, + "learning_rate": 9.383064516129033e-05, + "loss": 0.1916, + "step": 4326 + }, + { + "epoch": 0.069232, + "grad_norm": 1.1640625, + "learning_rate": 9.382903225806452e-05, + "loss": 0.2054, + "step": 4327 + }, + { + "epoch": 0.069248, + "grad_norm": 0.80078125, + "learning_rate": 9.382741935483872e-05, + "loss": 0.1601, + "step": 4328 + }, + { + "epoch": 0.069264, + "grad_norm": 0.81640625, + "learning_rate": 9.38258064516129e-05, + "loss": 0.1709, + "step": 4329 + }, + { + "epoch": 0.06928, + "grad_norm": 1.125, + "learning_rate": 9.382419354838709e-05, + "loss": 0.1685, + "step": 4330 + }, + { + "epoch": 0.069296, + "grad_norm": 1.1015625, + "learning_rate": 9.382258064516129e-05, + "loss": 0.1932, + "step": 4331 + }, + { + "epoch": 0.069312, + "grad_norm": 1.046875, + "learning_rate": 9.382096774193549e-05, + "loss": 0.2069, + "step": 4332 + }, + { + "epoch": 0.069328, + "grad_norm": 0.96875, + "learning_rate": 9.381935483870969e-05, + "loss": 0.1812, + "step": 4333 + }, + { + "epoch": 0.069344, + "grad_norm": 0.859375, + "learning_rate": 9.381774193548388e-05, + "loss": 0.1836, + "step": 4334 + }, + { + "epoch": 0.06936, + "grad_norm": 0.953125, + "learning_rate": 9.381612903225807e-05, + "loss": 0.1766, + "step": 4335 + }, + { + "epoch": 0.069376, + "grad_norm": 0.890625, + "learning_rate": 9.381451612903226e-05, + "loss": 0.1661, + "step": 4336 + }, + { + "epoch": 0.069392, + "grad_norm": 0.6015625, + "learning_rate": 9.381290322580646e-05, + "loss": 0.1672, + "step": 4337 + }, + { + "epoch": 0.069408, + "grad_norm": 0.73828125, + "learning_rate": 9.381129032258065e-05, + "loss": 0.1441, + "step": 4338 + }, + { + "epoch": 0.069424, + "grad_norm": 0.9765625, + "learning_rate": 9.380967741935485e-05, + "loss": 0.2262, + "step": 4339 + }, + { + "epoch": 0.06944, + "grad_norm": 0.8046875, + "learning_rate": 9.380806451612903e-05, + "loss": 0.1484, + "step": 4340 + }, + { + "epoch": 0.069456, + "grad_norm": 0.703125, + "learning_rate": 9.380645161290323e-05, + "loss": 0.172, + "step": 4341 + }, + { + "epoch": 0.069472, + "grad_norm": 0.984375, + "learning_rate": 9.380483870967742e-05, + "loss": 0.1939, + "step": 4342 + }, + { + "epoch": 0.069488, + "grad_norm": 1.2109375, + "learning_rate": 9.380322580645162e-05, + "loss": 0.2012, + "step": 4343 + }, + { + "epoch": 0.069504, + "grad_norm": 0.67578125, + "learning_rate": 9.380161290322582e-05, + "loss": 0.1971, + "step": 4344 + }, + { + "epoch": 0.06952, + "grad_norm": 0.88671875, + "learning_rate": 9.38e-05, + "loss": 0.2057, + "step": 4345 + }, + { + "epoch": 0.069536, + "grad_norm": 1.0859375, + "learning_rate": 9.37983870967742e-05, + "loss": 0.1869, + "step": 4346 + }, + { + "epoch": 0.069552, + "grad_norm": 0.77734375, + "learning_rate": 9.379677419354839e-05, + "loss": 0.1954, + "step": 4347 + }, + { + "epoch": 0.069568, + "grad_norm": 0.86328125, + "learning_rate": 9.379516129032259e-05, + "loss": 0.1951, + "step": 4348 + }, + { + "epoch": 0.069584, + "grad_norm": 0.7109375, + "learning_rate": 9.379354838709677e-05, + "loss": 0.2255, + "step": 4349 + }, + { + "epoch": 0.0696, + "grad_norm": 0.609375, + "learning_rate": 9.379193548387097e-05, + "loss": 0.2033, + "step": 4350 + }, + { + "epoch": 0.069616, + "grad_norm": 0.71484375, + "learning_rate": 9.379032258064516e-05, + "loss": 0.16, + "step": 4351 + }, + { + "epoch": 0.069632, + "grad_norm": 0.7734375, + "learning_rate": 9.378870967741936e-05, + "loss": 0.1631, + "step": 4352 + }, + { + "epoch": 0.069648, + "grad_norm": 0.61328125, + "learning_rate": 9.378709677419355e-05, + "loss": 0.1322, + "step": 4353 + }, + { + "epoch": 0.069664, + "grad_norm": 0.76171875, + "learning_rate": 9.378548387096775e-05, + "loss": 0.1567, + "step": 4354 + }, + { + "epoch": 0.06968, + "grad_norm": 1.5078125, + "learning_rate": 9.378387096774193e-05, + "loss": 0.1737, + "step": 4355 + }, + { + "epoch": 0.069696, + "grad_norm": 1.25, + "learning_rate": 9.378225806451613e-05, + "loss": 0.2103, + "step": 4356 + }, + { + "epoch": 0.069712, + "grad_norm": 1.3671875, + "learning_rate": 9.378064516129033e-05, + "loss": 0.2004, + "step": 4357 + }, + { + "epoch": 0.069728, + "grad_norm": 0.70703125, + "learning_rate": 9.377903225806453e-05, + "loss": 0.2031, + "step": 4358 + }, + { + "epoch": 0.069744, + "grad_norm": 0.83203125, + "learning_rate": 9.377741935483872e-05, + "loss": 0.1902, + "step": 4359 + }, + { + "epoch": 0.06976, + "grad_norm": 0.58203125, + "learning_rate": 9.377580645161292e-05, + "loss": 0.1422, + "step": 4360 + }, + { + "epoch": 0.069776, + "grad_norm": 0.734375, + "learning_rate": 9.37741935483871e-05, + "loss": 0.1809, + "step": 4361 + }, + { + "epoch": 0.069792, + "grad_norm": 0.953125, + "learning_rate": 9.377258064516129e-05, + "loss": 0.1969, + "step": 4362 + }, + { + "epoch": 0.069808, + "grad_norm": 0.9375, + "learning_rate": 9.377096774193549e-05, + "loss": 0.1835, + "step": 4363 + }, + { + "epoch": 0.069824, + "grad_norm": 1.046875, + "learning_rate": 9.376935483870967e-05, + "loss": 0.1747, + "step": 4364 + }, + { + "epoch": 0.06984, + "grad_norm": 0.6953125, + "learning_rate": 9.376774193548387e-05, + "loss": 0.2205, + "step": 4365 + }, + { + "epoch": 0.069856, + "grad_norm": 0.94921875, + "learning_rate": 9.376612903225806e-05, + "loss": 0.2102, + "step": 4366 + }, + { + "epoch": 0.069872, + "grad_norm": 0.90234375, + "learning_rate": 9.376451612903226e-05, + "loss": 0.1867, + "step": 4367 + }, + { + "epoch": 0.069888, + "grad_norm": 0.5625, + "learning_rate": 9.376290322580646e-05, + "loss": 0.1398, + "step": 4368 + }, + { + "epoch": 0.069904, + "grad_norm": 0.75390625, + "learning_rate": 9.376129032258066e-05, + "loss": 0.1947, + "step": 4369 + }, + { + "epoch": 0.06992, + "grad_norm": 0.83984375, + "learning_rate": 9.375967741935484e-05, + "loss": 0.2051, + "step": 4370 + }, + { + "epoch": 0.069936, + "grad_norm": 1.40625, + "learning_rate": 9.375806451612904e-05, + "loss": 0.1917, + "step": 4371 + }, + { + "epoch": 0.069952, + "grad_norm": 2.09375, + "learning_rate": 9.375645161290323e-05, + "loss": 0.2061, + "step": 4372 + }, + { + "epoch": 0.069968, + "grad_norm": 0.640625, + "learning_rate": 9.375483870967743e-05, + "loss": 0.1967, + "step": 4373 + }, + { + "epoch": 0.069984, + "grad_norm": 0.8828125, + "learning_rate": 9.375322580645162e-05, + "loss": 0.1616, + "step": 4374 + }, + { + "epoch": 0.07, + "grad_norm": 0.875, + "learning_rate": 9.375161290322582e-05, + "loss": 0.1853, + "step": 4375 + }, + { + "epoch": 0.070016, + "grad_norm": 0.98828125, + "learning_rate": 9.375e-05, + "loss": 0.1843, + "step": 4376 + }, + { + "epoch": 0.070032, + "grad_norm": 0.80078125, + "learning_rate": 9.374838709677419e-05, + "loss": 0.1871, + "step": 4377 + }, + { + "epoch": 0.070048, + "grad_norm": 0.953125, + "learning_rate": 9.374677419354839e-05, + "loss": 0.2112, + "step": 4378 + }, + { + "epoch": 0.070064, + "grad_norm": 1.0078125, + "learning_rate": 9.374516129032259e-05, + "loss": 0.1359, + "step": 4379 + }, + { + "epoch": 0.07008, + "grad_norm": 0.76953125, + "learning_rate": 9.374354838709677e-05, + "loss": 0.1588, + "step": 4380 + }, + { + "epoch": 0.070096, + "grad_norm": 0.62890625, + "learning_rate": 9.374193548387097e-05, + "loss": 0.1765, + "step": 4381 + }, + { + "epoch": 0.070112, + "grad_norm": 1.1171875, + "learning_rate": 9.374032258064517e-05, + "loss": 0.1701, + "step": 4382 + }, + { + "epoch": 0.070128, + "grad_norm": 0.98828125, + "learning_rate": 9.373870967741936e-05, + "loss": 0.1683, + "step": 4383 + }, + { + "epoch": 0.070144, + "grad_norm": 1.3046875, + "learning_rate": 9.373709677419356e-05, + "loss": 0.1785, + "step": 4384 + }, + { + "epoch": 0.07016, + "grad_norm": 0.74609375, + "learning_rate": 9.373548387096774e-05, + "loss": 0.13, + "step": 4385 + }, + { + "epoch": 0.070176, + "grad_norm": 1.0546875, + "learning_rate": 9.373387096774194e-05, + "loss": 0.1531, + "step": 4386 + }, + { + "epoch": 0.070192, + "grad_norm": 0.71484375, + "learning_rate": 9.373225806451613e-05, + "loss": 0.157, + "step": 4387 + }, + { + "epoch": 0.070208, + "grad_norm": 1.0078125, + "learning_rate": 9.373064516129033e-05, + "loss": 0.1675, + "step": 4388 + }, + { + "epoch": 0.070224, + "grad_norm": 0.875, + "learning_rate": 9.372903225806452e-05, + "loss": 0.1959, + "step": 4389 + }, + { + "epoch": 0.07024, + "grad_norm": 0.68359375, + "learning_rate": 9.372741935483872e-05, + "loss": 0.1781, + "step": 4390 + }, + { + "epoch": 0.070256, + "grad_norm": 1.4375, + "learning_rate": 9.37258064516129e-05, + "loss": 0.1916, + "step": 4391 + }, + { + "epoch": 0.070272, + "grad_norm": 0.76953125, + "learning_rate": 9.37241935483871e-05, + "loss": 0.161, + "step": 4392 + }, + { + "epoch": 0.070288, + "grad_norm": 0.6796875, + "learning_rate": 9.37225806451613e-05, + "loss": 0.1662, + "step": 4393 + }, + { + "epoch": 0.070304, + "grad_norm": 1.0625, + "learning_rate": 9.372096774193549e-05, + "loss": 0.2058, + "step": 4394 + }, + { + "epoch": 0.07032, + "grad_norm": 1.46875, + "learning_rate": 9.371935483870969e-05, + "loss": 0.1806, + "step": 4395 + }, + { + "epoch": 0.070336, + "grad_norm": 0.87890625, + "learning_rate": 9.371774193548387e-05, + "loss": 0.1784, + "step": 4396 + }, + { + "epoch": 0.070352, + "grad_norm": 0.85546875, + "learning_rate": 9.371612903225807e-05, + "loss": 0.1853, + "step": 4397 + }, + { + "epoch": 0.070368, + "grad_norm": 0.95703125, + "learning_rate": 9.371451612903226e-05, + "loss": 0.1488, + "step": 4398 + }, + { + "epoch": 0.070384, + "grad_norm": 0.875, + "learning_rate": 9.371290322580646e-05, + "loss": 0.2069, + "step": 4399 + }, + { + "epoch": 0.0704, + "grad_norm": 1.0, + "learning_rate": 9.371129032258064e-05, + "loss": 0.2079, + "step": 4400 + }, + { + "epoch": 0.070416, + "grad_norm": 1.0234375, + "learning_rate": 9.370967741935484e-05, + "loss": 0.2197, + "step": 4401 + }, + { + "epoch": 0.070432, + "grad_norm": 0.640625, + "learning_rate": 9.370806451612903e-05, + "loss": 0.1127, + "step": 4402 + }, + { + "epoch": 0.070448, + "grad_norm": 1.3046875, + "learning_rate": 9.370645161290323e-05, + "loss": 0.207, + "step": 4403 + }, + { + "epoch": 0.070464, + "grad_norm": 0.85546875, + "learning_rate": 9.370483870967743e-05, + "loss": 0.1969, + "step": 4404 + }, + { + "epoch": 0.07048, + "grad_norm": 0.93359375, + "learning_rate": 9.370322580645163e-05, + "loss": 0.1382, + "step": 4405 + }, + { + "epoch": 0.070496, + "grad_norm": 1.15625, + "learning_rate": 9.370161290322581e-05, + "loss": 0.2109, + "step": 4406 + }, + { + "epoch": 0.070512, + "grad_norm": 0.953125, + "learning_rate": 9.370000000000001e-05, + "loss": 0.1845, + "step": 4407 + }, + { + "epoch": 0.070528, + "grad_norm": 0.9296875, + "learning_rate": 9.36983870967742e-05, + "loss": 0.1489, + "step": 4408 + }, + { + "epoch": 0.070544, + "grad_norm": 0.671875, + "learning_rate": 9.369677419354839e-05, + "loss": 0.1734, + "step": 4409 + }, + { + "epoch": 0.07056, + "grad_norm": 1.09375, + "learning_rate": 9.369516129032259e-05, + "loss": 0.2131, + "step": 4410 + }, + { + "epoch": 0.070576, + "grad_norm": 0.63671875, + "learning_rate": 9.369354838709677e-05, + "loss": 0.1437, + "step": 4411 + }, + { + "epoch": 0.070592, + "grad_norm": 0.94921875, + "learning_rate": 9.369193548387097e-05, + "loss": 0.1638, + "step": 4412 + }, + { + "epoch": 0.070608, + "grad_norm": 0.87109375, + "learning_rate": 9.369032258064516e-05, + "loss": 0.1406, + "step": 4413 + }, + { + "epoch": 0.070624, + "grad_norm": 1.03125, + "learning_rate": 9.368870967741936e-05, + "loss": 0.1683, + "step": 4414 + }, + { + "epoch": 0.07064, + "grad_norm": 0.59765625, + "learning_rate": 9.368709677419354e-05, + "loss": 0.1743, + "step": 4415 + }, + { + "epoch": 0.070656, + "grad_norm": 1.25, + "learning_rate": 9.368548387096774e-05, + "loss": 0.2132, + "step": 4416 + }, + { + "epoch": 0.070672, + "grad_norm": 1.3046875, + "learning_rate": 9.368387096774194e-05, + "loss": 0.1973, + "step": 4417 + }, + { + "epoch": 0.070688, + "grad_norm": 0.90234375, + "learning_rate": 9.368225806451614e-05, + "loss": 0.1713, + "step": 4418 + }, + { + "epoch": 0.070704, + "grad_norm": 1.0, + "learning_rate": 9.368064516129033e-05, + "loss": 0.1639, + "step": 4419 + }, + { + "epoch": 0.07072, + "grad_norm": 0.75, + "learning_rate": 9.367903225806453e-05, + "loss": 0.1468, + "step": 4420 + }, + { + "epoch": 0.070736, + "grad_norm": 0.78515625, + "learning_rate": 9.367741935483871e-05, + "loss": 0.2049, + "step": 4421 + }, + { + "epoch": 0.070752, + "grad_norm": 1.3828125, + "learning_rate": 9.367580645161291e-05, + "loss": 0.181, + "step": 4422 + }, + { + "epoch": 0.070768, + "grad_norm": 1.4375, + "learning_rate": 9.36741935483871e-05, + "loss": 0.2431, + "step": 4423 + }, + { + "epoch": 0.070784, + "grad_norm": 0.89453125, + "learning_rate": 9.367258064516129e-05, + "loss": 0.1789, + "step": 4424 + }, + { + "epoch": 0.0708, + "grad_norm": 0.9921875, + "learning_rate": 9.367096774193549e-05, + "loss": 0.1442, + "step": 4425 + }, + { + "epoch": 0.070816, + "grad_norm": 1.0390625, + "learning_rate": 9.366935483870967e-05, + "loss": 0.1458, + "step": 4426 + }, + { + "epoch": 0.070832, + "grad_norm": 0.703125, + "learning_rate": 9.366774193548387e-05, + "loss": 0.1741, + "step": 4427 + }, + { + "epoch": 0.070848, + "grad_norm": 0.7421875, + "learning_rate": 9.366612903225807e-05, + "loss": 0.1761, + "step": 4428 + }, + { + "epoch": 0.070864, + "grad_norm": 0.8515625, + "learning_rate": 9.366451612903227e-05, + "loss": 0.1748, + "step": 4429 + }, + { + "epoch": 0.07088, + "grad_norm": 1.28125, + "learning_rate": 9.366290322580646e-05, + "loss": 0.1597, + "step": 4430 + }, + { + "epoch": 0.070896, + "grad_norm": 0.98828125, + "learning_rate": 9.366129032258066e-05, + "loss": 0.1633, + "step": 4431 + }, + { + "epoch": 0.070912, + "grad_norm": 0.7734375, + "learning_rate": 9.365967741935484e-05, + "loss": 0.2082, + "step": 4432 + }, + { + "epoch": 0.070928, + "grad_norm": 0.7265625, + "learning_rate": 9.365806451612904e-05, + "loss": 0.1778, + "step": 4433 + }, + { + "epoch": 0.070944, + "grad_norm": 0.953125, + "learning_rate": 9.365645161290323e-05, + "loss": 0.1496, + "step": 4434 + }, + { + "epoch": 0.07096, + "grad_norm": 1.1953125, + "learning_rate": 9.365483870967743e-05, + "loss": 0.2008, + "step": 4435 + }, + { + "epoch": 0.070976, + "grad_norm": 1.2734375, + "learning_rate": 9.365322580645161e-05, + "loss": 0.204, + "step": 4436 + }, + { + "epoch": 0.070992, + "grad_norm": 0.703125, + "learning_rate": 9.365161290322581e-05, + "loss": 0.1489, + "step": 4437 + }, + { + "epoch": 0.071008, + "grad_norm": 0.53515625, + "learning_rate": 9.365e-05, + "loss": 0.1492, + "step": 4438 + }, + { + "epoch": 0.071024, + "grad_norm": 0.7890625, + "learning_rate": 9.36483870967742e-05, + "loss": 0.1708, + "step": 4439 + }, + { + "epoch": 0.07104, + "grad_norm": 0.6484375, + "learning_rate": 9.36467741935484e-05, + "loss": 0.1806, + "step": 4440 + }, + { + "epoch": 0.071056, + "grad_norm": 1.0390625, + "learning_rate": 9.364516129032258e-05, + "loss": 0.2073, + "step": 4441 + }, + { + "epoch": 0.071072, + "grad_norm": 0.8203125, + "learning_rate": 9.364354838709678e-05, + "loss": 0.1327, + "step": 4442 + }, + { + "epoch": 0.071088, + "grad_norm": 0.52734375, + "learning_rate": 9.364193548387097e-05, + "loss": 0.1554, + "step": 4443 + }, + { + "epoch": 0.071104, + "grad_norm": 0.6875, + "learning_rate": 9.364032258064517e-05, + "loss": 0.1753, + "step": 4444 + }, + { + "epoch": 0.07112, + "grad_norm": 1.234375, + "learning_rate": 9.363870967741936e-05, + "loss": 0.2036, + "step": 4445 + }, + { + "epoch": 0.071136, + "grad_norm": 0.70703125, + "learning_rate": 9.363709677419356e-05, + "loss": 0.1856, + "step": 4446 + }, + { + "epoch": 0.071152, + "grad_norm": 0.890625, + "learning_rate": 9.363548387096774e-05, + "loss": 0.2038, + "step": 4447 + }, + { + "epoch": 0.071168, + "grad_norm": 0.87890625, + "learning_rate": 9.363387096774194e-05, + "loss": 0.233, + "step": 4448 + }, + { + "epoch": 0.071184, + "grad_norm": 1.5546875, + "learning_rate": 9.363225806451613e-05, + "loss": 0.1529, + "step": 4449 + }, + { + "epoch": 0.0712, + "grad_norm": 0.98046875, + "learning_rate": 9.363064516129033e-05, + "loss": 0.1857, + "step": 4450 + }, + { + "epoch": 0.071216, + "grad_norm": 0.66015625, + "learning_rate": 9.362903225806451e-05, + "loss": 0.1768, + "step": 4451 + }, + { + "epoch": 0.071232, + "grad_norm": 0.87890625, + "learning_rate": 9.362741935483871e-05, + "loss": 0.1758, + "step": 4452 + }, + { + "epoch": 0.071248, + "grad_norm": 0.73046875, + "learning_rate": 9.362580645161291e-05, + "loss": 0.2229, + "step": 4453 + }, + { + "epoch": 0.071264, + "grad_norm": 0.7890625, + "learning_rate": 9.36241935483871e-05, + "loss": 0.1625, + "step": 4454 + }, + { + "epoch": 0.07128, + "grad_norm": 0.921875, + "learning_rate": 9.36225806451613e-05, + "loss": 0.2075, + "step": 4455 + }, + { + "epoch": 0.071296, + "grad_norm": 0.5859375, + "learning_rate": 9.362096774193548e-05, + "loss": 0.1529, + "step": 4456 + }, + { + "epoch": 0.071312, + "grad_norm": 0.81640625, + "learning_rate": 9.361935483870968e-05, + "loss": 0.1679, + "step": 4457 + }, + { + "epoch": 0.071328, + "grad_norm": 0.6953125, + "learning_rate": 9.361774193548387e-05, + "loss": 0.1715, + "step": 4458 + }, + { + "epoch": 0.071344, + "grad_norm": 0.6875, + "learning_rate": 9.361612903225807e-05, + "loss": 0.1538, + "step": 4459 + }, + { + "epoch": 0.07136, + "grad_norm": 1.109375, + "learning_rate": 9.361451612903226e-05, + "loss": 0.2023, + "step": 4460 + }, + { + "epoch": 0.071376, + "grad_norm": 0.9140625, + "learning_rate": 9.361290322580646e-05, + "loss": 0.1777, + "step": 4461 + }, + { + "epoch": 0.071392, + "grad_norm": 0.8203125, + "learning_rate": 9.361129032258064e-05, + "loss": 0.2179, + "step": 4462 + }, + { + "epoch": 0.071408, + "grad_norm": 0.9453125, + "learning_rate": 9.360967741935484e-05, + "loss": 0.2347, + "step": 4463 + }, + { + "epoch": 0.071424, + "grad_norm": 0.6484375, + "learning_rate": 9.360806451612904e-05, + "loss": 0.1734, + "step": 4464 + }, + { + "epoch": 0.07144, + "grad_norm": 0.765625, + "learning_rate": 9.360645161290324e-05, + "loss": 0.1861, + "step": 4465 + }, + { + "epoch": 0.071456, + "grad_norm": 1.09375, + "learning_rate": 9.360483870967743e-05, + "loss": 0.2177, + "step": 4466 + }, + { + "epoch": 0.071472, + "grad_norm": 0.76953125, + "learning_rate": 9.360322580645163e-05, + "loss": 0.2088, + "step": 4467 + }, + { + "epoch": 0.071488, + "grad_norm": 0.8046875, + "learning_rate": 9.360161290322581e-05, + "loss": 0.219, + "step": 4468 + }, + { + "epoch": 0.071504, + "grad_norm": 0.80859375, + "learning_rate": 9.360000000000001e-05, + "loss": 0.2024, + "step": 4469 + }, + { + "epoch": 0.07152, + "grad_norm": 0.890625, + "learning_rate": 9.35983870967742e-05, + "loss": 0.1763, + "step": 4470 + }, + { + "epoch": 0.071536, + "grad_norm": 0.76953125, + "learning_rate": 9.359677419354838e-05, + "loss": 0.1731, + "step": 4471 + }, + { + "epoch": 0.071552, + "grad_norm": 0.78125, + "learning_rate": 9.359516129032258e-05, + "loss": 0.1546, + "step": 4472 + }, + { + "epoch": 0.071568, + "grad_norm": 0.6484375, + "learning_rate": 9.359354838709677e-05, + "loss": 0.1563, + "step": 4473 + }, + { + "epoch": 0.071584, + "grad_norm": 1.1875, + "learning_rate": 9.359193548387097e-05, + "loss": 0.2306, + "step": 4474 + }, + { + "epoch": 0.0716, + "grad_norm": 0.60546875, + "learning_rate": 9.359032258064517e-05, + "loss": 0.1362, + "step": 4475 + }, + { + "epoch": 0.071616, + "grad_norm": 0.57421875, + "learning_rate": 9.358870967741936e-05, + "loss": 0.2109, + "step": 4476 + }, + { + "epoch": 0.071632, + "grad_norm": 0.83984375, + "learning_rate": 9.358709677419355e-05, + "loss": 0.1625, + "step": 4477 + }, + { + "epoch": 0.071648, + "grad_norm": 0.8359375, + "learning_rate": 9.358548387096775e-05, + "loss": 0.1635, + "step": 4478 + }, + { + "epoch": 0.071664, + "grad_norm": 0.828125, + "learning_rate": 9.358387096774194e-05, + "loss": 0.167, + "step": 4479 + }, + { + "epoch": 0.07168, + "grad_norm": 0.78125, + "learning_rate": 9.358225806451614e-05, + "loss": 0.1758, + "step": 4480 + }, + { + "epoch": 0.071696, + "grad_norm": 0.7890625, + "learning_rate": 9.358064516129033e-05, + "loss": 0.1619, + "step": 4481 + }, + { + "epoch": 0.071712, + "grad_norm": 1.0703125, + "learning_rate": 9.357903225806453e-05, + "loss": 0.1693, + "step": 4482 + }, + { + "epoch": 0.071728, + "grad_norm": 2.0, + "learning_rate": 9.357741935483871e-05, + "loss": 0.2066, + "step": 4483 + }, + { + "epoch": 0.071744, + "grad_norm": 1.5234375, + "learning_rate": 9.357580645161291e-05, + "loss": 0.1999, + "step": 4484 + }, + { + "epoch": 0.07176, + "grad_norm": 0.80078125, + "learning_rate": 9.35741935483871e-05, + "loss": 0.169, + "step": 4485 + }, + { + "epoch": 0.071776, + "grad_norm": 0.6953125, + "learning_rate": 9.357258064516128e-05, + "loss": 0.1792, + "step": 4486 + }, + { + "epoch": 0.071792, + "grad_norm": 0.94921875, + "learning_rate": 9.357096774193548e-05, + "loss": 0.2401, + "step": 4487 + }, + { + "epoch": 0.071808, + "grad_norm": 1.03125, + "learning_rate": 9.356935483870968e-05, + "loss": 0.2355, + "step": 4488 + }, + { + "epoch": 0.071824, + "grad_norm": 0.9375, + "learning_rate": 9.356774193548388e-05, + "loss": 0.1873, + "step": 4489 + }, + { + "epoch": 0.07184, + "grad_norm": 0.96484375, + "learning_rate": 9.356612903225807e-05, + "loss": 0.2339, + "step": 4490 + }, + { + "epoch": 0.071856, + "grad_norm": 0.65234375, + "learning_rate": 9.356451612903227e-05, + "loss": 0.2063, + "step": 4491 + }, + { + "epoch": 0.071872, + "grad_norm": 0.62890625, + "learning_rate": 9.356290322580645e-05, + "loss": 0.1688, + "step": 4492 + }, + { + "epoch": 0.071888, + "grad_norm": 0.859375, + "learning_rate": 9.356129032258065e-05, + "loss": 0.2028, + "step": 4493 + }, + { + "epoch": 0.071904, + "grad_norm": 0.6328125, + "learning_rate": 9.355967741935484e-05, + "loss": 0.176, + "step": 4494 + }, + { + "epoch": 0.07192, + "grad_norm": 0.8515625, + "learning_rate": 9.355806451612904e-05, + "loss": 0.1905, + "step": 4495 + }, + { + "epoch": 0.071936, + "grad_norm": 0.765625, + "learning_rate": 9.355645161290323e-05, + "loss": 0.1917, + "step": 4496 + }, + { + "epoch": 0.071952, + "grad_norm": 0.7890625, + "learning_rate": 9.355483870967743e-05, + "loss": 0.2038, + "step": 4497 + }, + { + "epoch": 0.071968, + "grad_norm": 0.71875, + "learning_rate": 9.355322580645161e-05, + "loss": 0.2156, + "step": 4498 + }, + { + "epoch": 0.071984, + "grad_norm": 1.265625, + "learning_rate": 9.355161290322581e-05, + "loss": 0.2089, + "step": 4499 + }, + { + "epoch": 0.072, + "grad_norm": 0.93359375, + "learning_rate": 9.355000000000001e-05, + "loss": 0.2278, + "step": 4500 + }, + { + "epoch": 0.072016, + "grad_norm": 1.109375, + "learning_rate": 9.35483870967742e-05, + "loss": 0.2314, + "step": 4501 + }, + { + "epoch": 0.072032, + "grad_norm": 1.3828125, + "learning_rate": 9.35467741935484e-05, + "loss": 0.2024, + "step": 4502 + }, + { + "epoch": 0.072048, + "grad_norm": 1.140625, + "learning_rate": 9.354516129032258e-05, + "loss": 0.1784, + "step": 4503 + }, + { + "epoch": 0.072064, + "grad_norm": 0.78125, + "learning_rate": 9.354354838709678e-05, + "loss": 0.1765, + "step": 4504 + }, + { + "epoch": 0.07208, + "grad_norm": 0.859375, + "learning_rate": 9.354193548387097e-05, + "loss": 0.1864, + "step": 4505 + }, + { + "epoch": 0.072096, + "grad_norm": 0.75390625, + "learning_rate": 9.354032258064517e-05, + "loss": 0.1884, + "step": 4506 + }, + { + "epoch": 0.072112, + "grad_norm": 0.6640625, + "learning_rate": 9.353870967741935e-05, + "loss": 0.1905, + "step": 4507 + }, + { + "epoch": 0.072128, + "grad_norm": 0.8984375, + "learning_rate": 9.353709677419355e-05, + "loss": 0.2015, + "step": 4508 + }, + { + "epoch": 0.072144, + "grad_norm": 0.8125, + "learning_rate": 9.353548387096774e-05, + "loss": 0.1735, + "step": 4509 + }, + { + "epoch": 0.07216, + "grad_norm": 0.9453125, + "learning_rate": 9.353387096774194e-05, + "loss": 0.2059, + "step": 4510 + }, + { + "epoch": 0.072176, + "grad_norm": 1.0078125, + "learning_rate": 9.353225806451613e-05, + "loss": 0.1782, + "step": 4511 + }, + { + "epoch": 0.072192, + "grad_norm": 1.0546875, + "learning_rate": 9.353064516129032e-05, + "loss": 0.1763, + "step": 4512 + }, + { + "epoch": 0.072208, + "grad_norm": 0.9296875, + "learning_rate": 9.352903225806452e-05, + "loss": 0.2022, + "step": 4513 + }, + { + "epoch": 0.072224, + "grad_norm": 0.99609375, + "learning_rate": 9.352741935483872e-05, + "loss": 0.1671, + "step": 4514 + }, + { + "epoch": 0.07224, + "grad_norm": 0.640625, + "learning_rate": 9.352580645161291e-05, + "loss": 0.174, + "step": 4515 + }, + { + "epoch": 0.072256, + "grad_norm": 0.9140625, + "learning_rate": 9.352419354838711e-05, + "loss": 0.1933, + "step": 4516 + }, + { + "epoch": 0.072272, + "grad_norm": 0.75390625, + "learning_rate": 9.35225806451613e-05, + "loss": 0.1947, + "step": 4517 + }, + { + "epoch": 0.072288, + "grad_norm": 0.81640625, + "learning_rate": 9.352096774193548e-05, + "loss": 0.2249, + "step": 4518 + }, + { + "epoch": 0.072304, + "grad_norm": 0.7578125, + "learning_rate": 9.351935483870968e-05, + "loss": 0.1998, + "step": 4519 + }, + { + "epoch": 0.07232, + "grad_norm": 1.078125, + "learning_rate": 9.351774193548387e-05, + "loss": 0.1745, + "step": 4520 + }, + { + "epoch": 0.072336, + "grad_norm": 1.0625, + "learning_rate": 9.351612903225807e-05, + "loss": 0.2467, + "step": 4521 + }, + { + "epoch": 0.072352, + "grad_norm": 0.765625, + "learning_rate": 9.351451612903225e-05, + "loss": 0.1533, + "step": 4522 + }, + { + "epoch": 0.072368, + "grad_norm": 0.64453125, + "learning_rate": 9.351290322580645e-05, + "loss": 0.181, + "step": 4523 + }, + { + "epoch": 0.072384, + "grad_norm": 0.69921875, + "learning_rate": 9.351129032258065e-05, + "loss": 0.1433, + "step": 4524 + }, + { + "epoch": 0.0724, + "grad_norm": 0.82421875, + "learning_rate": 9.350967741935485e-05, + "loss": 0.174, + "step": 4525 + }, + { + "epoch": 0.072416, + "grad_norm": 0.66796875, + "learning_rate": 9.350806451612904e-05, + "loss": 0.1872, + "step": 4526 + }, + { + "epoch": 0.072432, + "grad_norm": 1.0390625, + "learning_rate": 9.350645161290324e-05, + "loss": 0.1842, + "step": 4527 + }, + { + "epoch": 0.072448, + "grad_norm": 0.68359375, + "learning_rate": 9.350483870967742e-05, + "loss": 0.1453, + "step": 4528 + }, + { + "epoch": 0.072464, + "grad_norm": 0.63671875, + "learning_rate": 9.350322580645162e-05, + "loss": 0.1589, + "step": 4529 + }, + { + "epoch": 0.07248, + "grad_norm": 1.109375, + "learning_rate": 9.350161290322581e-05, + "loss": 0.1702, + "step": 4530 + }, + { + "epoch": 0.072496, + "grad_norm": 0.9296875, + "learning_rate": 9.350000000000001e-05, + "loss": 0.2119, + "step": 4531 + }, + { + "epoch": 0.072512, + "grad_norm": 0.86328125, + "learning_rate": 9.34983870967742e-05, + "loss": 0.192, + "step": 4532 + }, + { + "epoch": 0.072528, + "grad_norm": 1.1328125, + "learning_rate": 9.349677419354838e-05, + "loss": 0.1992, + "step": 4533 + }, + { + "epoch": 0.072544, + "grad_norm": 0.91796875, + "learning_rate": 9.349516129032258e-05, + "loss": 0.1851, + "step": 4534 + }, + { + "epoch": 0.07256, + "grad_norm": 0.55859375, + "learning_rate": 9.349354838709678e-05, + "loss": 0.1724, + "step": 4535 + }, + { + "epoch": 0.072576, + "grad_norm": 0.96484375, + "learning_rate": 9.349193548387098e-05, + "loss": 0.179, + "step": 4536 + }, + { + "epoch": 0.072592, + "grad_norm": 0.73828125, + "learning_rate": 9.349032258064517e-05, + "loss": 0.1679, + "step": 4537 + }, + { + "epoch": 0.072608, + "grad_norm": 1.0546875, + "learning_rate": 9.348870967741937e-05, + "loss": 0.1947, + "step": 4538 + }, + { + "epoch": 0.072624, + "grad_norm": 0.640625, + "learning_rate": 9.348709677419355e-05, + "loss": 0.1537, + "step": 4539 + }, + { + "epoch": 0.07264, + "grad_norm": 1.1484375, + "learning_rate": 9.348548387096775e-05, + "loss": 0.1495, + "step": 4540 + }, + { + "epoch": 0.072656, + "grad_norm": 0.71484375, + "learning_rate": 9.348387096774194e-05, + "loss": 0.1879, + "step": 4541 + }, + { + "epoch": 0.072672, + "grad_norm": 0.76953125, + "learning_rate": 9.348225806451614e-05, + "loss": 0.1825, + "step": 4542 + }, + { + "epoch": 0.072688, + "grad_norm": 0.765625, + "learning_rate": 9.348064516129032e-05, + "loss": 0.1895, + "step": 4543 + }, + { + "epoch": 0.072704, + "grad_norm": 0.65625, + "learning_rate": 9.347903225806452e-05, + "loss": 0.1515, + "step": 4544 + }, + { + "epoch": 0.07272, + "grad_norm": 1.109375, + "learning_rate": 9.347741935483871e-05, + "loss": 0.204, + "step": 4545 + }, + { + "epoch": 0.072736, + "grad_norm": 0.83984375, + "learning_rate": 9.347580645161291e-05, + "loss": 0.1793, + "step": 4546 + }, + { + "epoch": 0.072752, + "grad_norm": 0.75, + "learning_rate": 9.34741935483871e-05, + "loss": 0.2165, + "step": 4547 + }, + { + "epoch": 0.072768, + "grad_norm": 1.171875, + "learning_rate": 9.34725806451613e-05, + "loss": 0.245, + "step": 4548 + }, + { + "epoch": 0.072784, + "grad_norm": 0.8984375, + "learning_rate": 9.34709677419355e-05, + "loss": 0.1993, + "step": 4549 + }, + { + "epoch": 0.0728, + "grad_norm": 0.67578125, + "learning_rate": 9.346935483870968e-05, + "loss": 0.1493, + "step": 4550 + }, + { + "epoch": 0.072816, + "grad_norm": 0.72265625, + "learning_rate": 9.346774193548388e-05, + "loss": 0.1381, + "step": 4551 + }, + { + "epoch": 0.072832, + "grad_norm": 1.0703125, + "learning_rate": 9.346612903225807e-05, + "loss": 0.1754, + "step": 4552 + }, + { + "epoch": 0.072848, + "grad_norm": 0.53125, + "learning_rate": 9.346451612903227e-05, + "loss": 0.1488, + "step": 4553 + }, + { + "epoch": 0.072864, + "grad_norm": 0.9375, + "learning_rate": 9.346290322580645e-05, + "loss": 0.2236, + "step": 4554 + }, + { + "epoch": 0.07288, + "grad_norm": 0.61328125, + "learning_rate": 9.346129032258065e-05, + "loss": 0.1527, + "step": 4555 + }, + { + "epoch": 0.072896, + "grad_norm": 0.66015625, + "learning_rate": 9.345967741935484e-05, + "loss": 0.1602, + "step": 4556 + }, + { + "epoch": 0.072912, + "grad_norm": 0.91015625, + "learning_rate": 9.345806451612904e-05, + "loss": 0.1836, + "step": 4557 + }, + { + "epoch": 0.072928, + "grad_norm": 0.99609375, + "learning_rate": 9.345645161290322e-05, + "loss": 0.1942, + "step": 4558 + }, + { + "epoch": 0.072944, + "grad_norm": 0.8515625, + "learning_rate": 9.345483870967742e-05, + "loss": 0.173, + "step": 4559 + }, + { + "epoch": 0.07296, + "grad_norm": 0.65625, + "learning_rate": 9.345322580645162e-05, + "loss": 0.1606, + "step": 4560 + }, + { + "epoch": 0.072976, + "grad_norm": 1.0546875, + "learning_rate": 9.345161290322582e-05, + "loss": 0.2234, + "step": 4561 + }, + { + "epoch": 0.072992, + "grad_norm": 0.69921875, + "learning_rate": 9.345000000000001e-05, + "loss": 0.1717, + "step": 4562 + }, + { + "epoch": 0.073008, + "grad_norm": 0.703125, + "learning_rate": 9.34483870967742e-05, + "loss": 0.1693, + "step": 4563 + }, + { + "epoch": 0.073024, + "grad_norm": 0.70703125, + "learning_rate": 9.34467741935484e-05, + "loss": 0.2314, + "step": 4564 + }, + { + "epoch": 0.07304, + "grad_norm": 0.54296875, + "learning_rate": 9.344516129032258e-05, + "loss": 0.1517, + "step": 4565 + }, + { + "epoch": 0.073056, + "grad_norm": 0.8515625, + "learning_rate": 9.344354838709678e-05, + "loss": 0.2096, + "step": 4566 + }, + { + "epoch": 0.073072, + "grad_norm": 0.77734375, + "learning_rate": 9.344193548387097e-05, + "loss": 0.189, + "step": 4567 + }, + { + "epoch": 0.073088, + "grad_norm": 0.72265625, + "learning_rate": 9.344032258064517e-05, + "loss": 0.1903, + "step": 4568 + }, + { + "epoch": 0.073104, + "grad_norm": 0.7109375, + "learning_rate": 9.343870967741935e-05, + "loss": 0.1492, + "step": 4569 + }, + { + "epoch": 0.07312, + "grad_norm": 0.77734375, + "learning_rate": 9.343709677419355e-05, + "loss": 0.1787, + "step": 4570 + }, + { + "epoch": 0.073136, + "grad_norm": 0.91015625, + "learning_rate": 9.343548387096774e-05, + "loss": 0.199, + "step": 4571 + }, + { + "epoch": 0.073152, + "grad_norm": 0.7578125, + "learning_rate": 9.343387096774194e-05, + "loss": 0.144, + "step": 4572 + }, + { + "epoch": 0.073168, + "grad_norm": 0.796875, + "learning_rate": 9.343225806451614e-05, + "loss": 0.176, + "step": 4573 + }, + { + "epoch": 0.073184, + "grad_norm": 0.74609375, + "learning_rate": 9.343064516129034e-05, + "loss": 0.1669, + "step": 4574 + }, + { + "epoch": 0.0732, + "grad_norm": 0.890625, + "learning_rate": 9.342903225806452e-05, + "loss": 0.1979, + "step": 4575 + }, + { + "epoch": 0.073216, + "grad_norm": 0.69921875, + "learning_rate": 9.342741935483872e-05, + "loss": 0.2013, + "step": 4576 + }, + { + "epoch": 0.073232, + "grad_norm": 0.92578125, + "learning_rate": 9.342580645161291e-05, + "loss": 0.1988, + "step": 4577 + }, + { + "epoch": 0.073248, + "grad_norm": 1.125, + "learning_rate": 9.342419354838711e-05, + "loss": 0.1875, + "step": 4578 + }, + { + "epoch": 0.073264, + "grad_norm": 1.0234375, + "learning_rate": 9.34225806451613e-05, + "loss": 0.1825, + "step": 4579 + }, + { + "epoch": 0.07328, + "grad_norm": 0.8046875, + "learning_rate": 9.342096774193548e-05, + "loss": 0.1871, + "step": 4580 + }, + { + "epoch": 0.073296, + "grad_norm": 1.015625, + "learning_rate": 9.341935483870968e-05, + "loss": 0.2228, + "step": 4581 + }, + { + "epoch": 0.073312, + "grad_norm": 1.34375, + "learning_rate": 9.341774193548387e-05, + "loss": 0.1823, + "step": 4582 + }, + { + "epoch": 0.073328, + "grad_norm": 0.91015625, + "learning_rate": 9.341612903225806e-05, + "loss": 0.1632, + "step": 4583 + }, + { + "epoch": 0.073344, + "grad_norm": 0.734375, + "learning_rate": 9.341451612903226e-05, + "loss": 0.188, + "step": 4584 + }, + { + "epoch": 0.07336, + "grad_norm": 1.2265625, + "learning_rate": 9.341290322580646e-05, + "loss": 0.2005, + "step": 4585 + }, + { + "epoch": 0.073376, + "grad_norm": 0.890625, + "learning_rate": 9.341129032258065e-05, + "loss": 0.1827, + "step": 4586 + }, + { + "epoch": 0.073392, + "grad_norm": 0.72265625, + "learning_rate": 9.340967741935485e-05, + "loss": 0.2071, + "step": 4587 + }, + { + "epoch": 0.073408, + "grad_norm": 0.67578125, + "learning_rate": 9.340806451612904e-05, + "loss": 0.1989, + "step": 4588 + }, + { + "epoch": 0.073424, + "grad_norm": 1.03125, + "learning_rate": 9.340645161290324e-05, + "loss": 0.1822, + "step": 4589 + }, + { + "epoch": 0.07344, + "grad_norm": 0.8359375, + "learning_rate": 9.340483870967742e-05, + "loss": 0.155, + "step": 4590 + }, + { + "epoch": 0.073456, + "grad_norm": 1.0703125, + "learning_rate": 9.340322580645162e-05, + "loss": 0.2017, + "step": 4591 + }, + { + "epoch": 0.073472, + "grad_norm": 0.83984375, + "learning_rate": 9.340161290322581e-05, + "loss": 0.1786, + "step": 4592 + }, + { + "epoch": 0.073488, + "grad_norm": 1.0234375, + "learning_rate": 9.340000000000001e-05, + "loss": 0.1726, + "step": 4593 + }, + { + "epoch": 0.073504, + "grad_norm": 0.90625, + "learning_rate": 9.339838709677419e-05, + "loss": 0.1993, + "step": 4594 + }, + { + "epoch": 0.07352, + "grad_norm": 0.875, + "learning_rate": 9.339677419354839e-05, + "loss": 0.2094, + "step": 4595 + }, + { + "epoch": 0.073536, + "grad_norm": 0.73046875, + "learning_rate": 9.339516129032259e-05, + "loss": 0.1922, + "step": 4596 + }, + { + "epoch": 0.073552, + "grad_norm": 0.6796875, + "learning_rate": 9.339354838709678e-05, + "loss": 0.1659, + "step": 4597 + }, + { + "epoch": 0.073568, + "grad_norm": 0.9375, + "learning_rate": 9.339193548387098e-05, + "loss": 0.2211, + "step": 4598 + }, + { + "epoch": 0.073584, + "grad_norm": 0.7109375, + "learning_rate": 9.339032258064516e-05, + "loss": 0.1936, + "step": 4599 + }, + { + "epoch": 0.0736, + "grad_norm": 0.6875, + "learning_rate": 9.338870967741936e-05, + "loss": 0.1874, + "step": 4600 + }, + { + "epoch": 0.073616, + "grad_norm": 0.84375, + "learning_rate": 9.338709677419355e-05, + "loss": 0.1904, + "step": 4601 + }, + { + "epoch": 0.073632, + "grad_norm": 0.68359375, + "learning_rate": 9.338548387096775e-05, + "loss": 0.1364, + "step": 4602 + }, + { + "epoch": 0.073648, + "grad_norm": 0.87890625, + "learning_rate": 9.338387096774194e-05, + "loss": 0.1902, + "step": 4603 + }, + { + "epoch": 0.073664, + "grad_norm": 0.75, + "learning_rate": 9.338225806451614e-05, + "loss": 0.1782, + "step": 4604 + }, + { + "epoch": 0.07368, + "grad_norm": 1.1484375, + "learning_rate": 9.338064516129032e-05, + "loss": 0.1856, + "step": 4605 + }, + { + "epoch": 0.073696, + "grad_norm": 0.73828125, + "learning_rate": 9.337903225806452e-05, + "loss": 0.1617, + "step": 4606 + }, + { + "epoch": 0.073712, + "grad_norm": 0.70703125, + "learning_rate": 9.337741935483871e-05, + "loss": 0.1636, + "step": 4607 + }, + { + "epoch": 0.073728, + "grad_norm": 0.71875, + "learning_rate": 9.33758064516129e-05, + "loss": 0.1817, + "step": 4608 + }, + { + "epoch": 0.073744, + "grad_norm": 0.6953125, + "learning_rate": 9.33741935483871e-05, + "loss": 0.1948, + "step": 4609 + }, + { + "epoch": 0.07376, + "grad_norm": 0.77734375, + "learning_rate": 9.337258064516129e-05, + "loss": 0.1723, + "step": 4610 + }, + { + "epoch": 0.073776, + "grad_norm": 0.65625, + "learning_rate": 9.337096774193549e-05, + "loss": 0.1741, + "step": 4611 + }, + { + "epoch": 0.073792, + "grad_norm": 0.71875, + "learning_rate": 9.336935483870968e-05, + "loss": 0.1652, + "step": 4612 + }, + { + "epoch": 0.073808, + "grad_norm": 0.58203125, + "learning_rate": 9.336774193548388e-05, + "loss": 0.1853, + "step": 4613 + }, + { + "epoch": 0.073824, + "grad_norm": 0.921875, + "learning_rate": 9.336612903225806e-05, + "loss": 0.1697, + "step": 4614 + }, + { + "epoch": 0.07384, + "grad_norm": 0.53125, + "learning_rate": 9.336451612903226e-05, + "loss": 0.1495, + "step": 4615 + }, + { + "epoch": 0.073856, + "grad_norm": 1.1875, + "learning_rate": 9.336290322580645e-05, + "loss": 0.1647, + "step": 4616 + }, + { + "epoch": 0.073872, + "grad_norm": 0.94140625, + "learning_rate": 9.336129032258065e-05, + "loss": 0.2278, + "step": 4617 + }, + { + "epoch": 0.073888, + "grad_norm": 0.75, + "learning_rate": 9.335967741935484e-05, + "loss": 0.154, + "step": 4618 + }, + { + "epoch": 0.073904, + "grad_norm": 0.78515625, + "learning_rate": 9.335806451612903e-05, + "loss": 0.2128, + "step": 4619 + }, + { + "epoch": 0.07392, + "grad_norm": 0.66015625, + "learning_rate": 9.335645161290323e-05, + "loss": 0.1823, + "step": 4620 + }, + { + "epoch": 0.073936, + "grad_norm": 0.87890625, + "learning_rate": 9.335483870967743e-05, + "loss": 0.2009, + "step": 4621 + }, + { + "epoch": 0.073952, + "grad_norm": 1.265625, + "learning_rate": 9.335322580645162e-05, + "loss": 0.1963, + "step": 4622 + }, + { + "epoch": 0.073968, + "grad_norm": 1.140625, + "learning_rate": 9.335161290322582e-05, + "loss": 0.1692, + "step": 4623 + }, + { + "epoch": 0.073984, + "grad_norm": 0.97265625, + "learning_rate": 9.335e-05, + "loss": 0.2426, + "step": 4624 + }, + { + "epoch": 0.074, + "grad_norm": 0.79296875, + "learning_rate": 9.334838709677419e-05, + "loss": 0.1941, + "step": 4625 + }, + { + "epoch": 0.074016, + "grad_norm": 0.7421875, + "learning_rate": 9.334677419354839e-05, + "loss": 0.1595, + "step": 4626 + }, + { + "epoch": 0.074032, + "grad_norm": 0.6171875, + "learning_rate": 9.334516129032258e-05, + "loss": 0.1581, + "step": 4627 + }, + { + "epoch": 0.074048, + "grad_norm": 0.59375, + "learning_rate": 9.334354838709678e-05, + "loss": 0.1735, + "step": 4628 + }, + { + "epoch": 0.074064, + "grad_norm": 1.4921875, + "learning_rate": 9.334193548387096e-05, + "loss": 0.1462, + "step": 4629 + }, + { + "epoch": 0.07408, + "grad_norm": 1.59375, + "learning_rate": 9.334032258064516e-05, + "loss": 0.1988, + "step": 4630 + }, + { + "epoch": 0.074096, + "grad_norm": 1.4453125, + "learning_rate": 9.333870967741936e-05, + "loss": 0.1879, + "step": 4631 + }, + { + "epoch": 0.074112, + "grad_norm": 0.92578125, + "learning_rate": 9.333709677419355e-05, + "loss": 0.2199, + "step": 4632 + }, + { + "epoch": 0.074128, + "grad_norm": 0.75390625, + "learning_rate": 9.333548387096775e-05, + "loss": 0.1624, + "step": 4633 + }, + { + "epoch": 0.074144, + "grad_norm": 0.9609375, + "learning_rate": 9.333387096774195e-05, + "loss": 0.1633, + "step": 4634 + }, + { + "epoch": 0.07416, + "grad_norm": 0.6875, + "learning_rate": 9.333225806451613e-05, + "loss": 0.2088, + "step": 4635 + }, + { + "epoch": 0.074176, + "grad_norm": 1.6171875, + "learning_rate": 9.333064516129033e-05, + "loss": 0.236, + "step": 4636 + }, + { + "epoch": 0.074192, + "grad_norm": 0.875, + "learning_rate": 9.332903225806452e-05, + "loss": 0.1493, + "step": 4637 + }, + { + "epoch": 0.074208, + "grad_norm": 0.55078125, + "learning_rate": 9.332741935483872e-05, + "loss": 0.16, + "step": 4638 + }, + { + "epoch": 0.074224, + "grad_norm": 0.984375, + "learning_rate": 9.33258064516129e-05, + "loss": 0.1636, + "step": 4639 + }, + { + "epoch": 0.07424, + "grad_norm": 1.1953125, + "learning_rate": 9.33241935483871e-05, + "loss": 0.1596, + "step": 4640 + }, + { + "epoch": 0.074256, + "grad_norm": 1.203125, + "learning_rate": 9.332258064516129e-05, + "loss": 0.2003, + "step": 4641 + }, + { + "epoch": 0.074272, + "grad_norm": 0.6484375, + "learning_rate": 9.332096774193548e-05, + "loss": 0.1649, + "step": 4642 + }, + { + "epoch": 0.074288, + "grad_norm": 0.83203125, + "learning_rate": 9.331935483870968e-05, + "loss": 0.1689, + "step": 4643 + }, + { + "epoch": 0.074304, + "grad_norm": 0.765625, + "learning_rate": 9.331774193548388e-05, + "loss": 0.2065, + "step": 4644 + }, + { + "epoch": 0.07432, + "grad_norm": 0.69140625, + "learning_rate": 9.331612903225808e-05, + "loss": 0.1727, + "step": 4645 + }, + { + "epoch": 0.074336, + "grad_norm": 0.90234375, + "learning_rate": 9.331451612903226e-05, + "loss": 0.1815, + "step": 4646 + }, + { + "epoch": 0.074352, + "grad_norm": 0.9296875, + "learning_rate": 9.331290322580646e-05, + "loss": 0.1802, + "step": 4647 + }, + { + "epoch": 0.074368, + "grad_norm": 0.875, + "learning_rate": 9.331129032258065e-05, + "loss": 0.1531, + "step": 4648 + }, + { + "epoch": 0.074384, + "grad_norm": 1.0078125, + "learning_rate": 9.330967741935485e-05, + "loss": 0.1891, + "step": 4649 + }, + { + "epoch": 0.0744, + "grad_norm": 1.0078125, + "learning_rate": 9.330806451612903e-05, + "loss": 0.1562, + "step": 4650 + }, + { + "epoch": 0.074416, + "grad_norm": 0.89453125, + "learning_rate": 9.330645161290323e-05, + "loss": 0.1991, + "step": 4651 + }, + { + "epoch": 0.074432, + "grad_norm": 1.453125, + "learning_rate": 9.330483870967742e-05, + "loss": 0.2165, + "step": 4652 + }, + { + "epoch": 0.074448, + "grad_norm": 1.0, + "learning_rate": 9.330322580645162e-05, + "loss": 0.1784, + "step": 4653 + }, + { + "epoch": 0.074464, + "grad_norm": 0.75, + "learning_rate": 9.33016129032258e-05, + "loss": 0.1589, + "step": 4654 + }, + { + "epoch": 0.07448, + "grad_norm": 0.6953125, + "learning_rate": 9.33e-05, + "loss": 0.1707, + "step": 4655 + }, + { + "epoch": 0.074496, + "grad_norm": 0.61328125, + "learning_rate": 9.32983870967742e-05, + "loss": 0.1543, + "step": 4656 + }, + { + "epoch": 0.074512, + "grad_norm": 0.56640625, + "learning_rate": 9.329677419354839e-05, + "loss": 0.1491, + "step": 4657 + }, + { + "epoch": 0.074528, + "grad_norm": 1.140625, + "learning_rate": 9.329516129032259e-05, + "loss": 0.1553, + "step": 4658 + }, + { + "epoch": 0.074544, + "grad_norm": 0.66015625, + "learning_rate": 9.329354838709678e-05, + "loss": 0.2013, + "step": 4659 + }, + { + "epoch": 0.07456, + "grad_norm": 1.2578125, + "learning_rate": 9.329193548387098e-05, + "loss": 0.2157, + "step": 4660 + }, + { + "epoch": 0.074576, + "grad_norm": 1.203125, + "learning_rate": 9.329032258064516e-05, + "loss": 0.1668, + "step": 4661 + }, + { + "epoch": 0.074592, + "grad_norm": 1.0703125, + "learning_rate": 9.328870967741936e-05, + "loss": 0.1732, + "step": 4662 + }, + { + "epoch": 0.074608, + "grad_norm": 0.5859375, + "learning_rate": 9.328709677419355e-05, + "loss": 0.1707, + "step": 4663 + }, + { + "epoch": 0.074624, + "grad_norm": 0.94140625, + "learning_rate": 9.328548387096775e-05, + "loss": 0.2123, + "step": 4664 + }, + { + "epoch": 0.07464, + "grad_norm": 0.80078125, + "learning_rate": 9.328387096774193e-05, + "loss": 0.1599, + "step": 4665 + }, + { + "epoch": 0.074656, + "grad_norm": 0.5703125, + "learning_rate": 9.328225806451613e-05, + "loss": 0.1399, + "step": 4666 + }, + { + "epoch": 0.074672, + "grad_norm": 0.83984375, + "learning_rate": 9.328064516129032e-05, + "loss": 0.1929, + "step": 4667 + }, + { + "epoch": 0.074688, + "grad_norm": 0.65625, + "learning_rate": 9.327903225806452e-05, + "loss": 0.2044, + "step": 4668 + }, + { + "epoch": 0.074704, + "grad_norm": 0.86328125, + "learning_rate": 9.327741935483872e-05, + "loss": 0.1387, + "step": 4669 + }, + { + "epoch": 0.07472, + "grad_norm": 0.69921875, + "learning_rate": 9.327580645161292e-05, + "loss": 0.1473, + "step": 4670 + }, + { + "epoch": 0.074736, + "grad_norm": 0.80078125, + "learning_rate": 9.32741935483871e-05, + "loss": 0.1751, + "step": 4671 + }, + { + "epoch": 0.074752, + "grad_norm": 0.6953125, + "learning_rate": 9.327258064516129e-05, + "loss": 0.1989, + "step": 4672 + }, + { + "epoch": 0.074768, + "grad_norm": 0.89453125, + "learning_rate": 9.327096774193549e-05, + "loss": 0.1955, + "step": 4673 + }, + { + "epoch": 0.074784, + "grad_norm": 1.3671875, + "learning_rate": 9.326935483870968e-05, + "loss": 0.1862, + "step": 4674 + }, + { + "epoch": 0.0748, + "grad_norm": 0.98046875, + "learning_rate": 9.326774193548388e-05, + "loss": 0.1821, + "step": 4675 + }, + { + "epoch": 0.074816, + "grad_norm": 0.875, + "learning_rate": 9.326612903225806e-05, + "loss": 0.1621, + "step": 4676 + }, + { + "epoch": 0.074832, + "grad_norm": 0.6640625, + "learning_rate": 9.326451612903226e-05, + "loss": 0.1848, + "step": 4677 + }, + { + "epoch": 0.074848, + "grad_norm": 1.015625, + "learning_rate": 9.326290322580645e-05, + "loss": 0.1467, + "step": 4678 + }, + { + "epoch": 0.074864, + "grad_norm": 0.7265625, + "learning_rate": 9.326129032258065e-05, + "loss": 0.1638, + "step": 4679 + }, + { + "epoch": 0.07488, + "grad_norm": 0.78515625, + "learning_rate": 9.325967741935485e-05, + "loss": 0.1715, + "step": 4680 + }, + { + "epoch": 0.074896, + "grad_norm": 1.0390625, + "learning_rate": 9.325806451612905e-05, + "loss": 0.1966, + "step": 4681 + }, + { + "epoch": 0.074912, + "grad_norm": 0.875, + "learning_rate": 9.325645161290323e-05, + "loss": 0.2078, + "step": 4682 + }, + { + "epoch": 0.074928, + "grad_norm": 0.7109375, + "learning_rate": 9.325483870967743e-05, + "loss": 0.1444, + "step": 4683 + }, + { + "epoch": 0.074944, + "grad_norm": 0.765625, + "learning_rate": 9.325322580645162e-05, + "loss": 0.1652, + "step": 4684 + }, + { + "epoch": 0.07496, + "grad_norm": 0.8046875, + "learning_rate": 9.325161290322582e-05, + "loss": 0.1685, + "step": 4685 + }, + { + "epoch": 0.074976, + "grad_norm": 0.72265625, + "learning_rate": 9.325e-05, + "loss": 0.1957, + "step": 4686 + }, + { + "epoch": 0.074992, + "grad_norm": 0.78515625, + "learning_rate": 9.32483870967742e-05, + "loss": 0.1846, + "step": 4687 + }, + { + "epoch": 0.075008, + "grad_norm": 1.1640625, + "learning_rate": 9.324677419354839e-05, + "loss": 0.2039, + "step": 4688 + }, + { + "epoch": 0.075024, + "grad_norm": 0.69140625, + "learning_rate": 9.324516129032258e-05, + "loss": 0.1726, + "step": 4689 + }, + { + "epoch": 0.07504, + "grad_norm": 1.1796875, + "learning_rate": 9.324354838709677e-05, + "loss": 0.1926, + "step": 4690 + }, + { + "epoch": 0.075056, + "grad_norm": 0.9765625, + "learning_rate": 9.324193548387097e-05, + "loss": 0.182, + "step": 4691 + }, + { + "epoch": 0.075072, + "grad_norm": 1.1015625, + "learning_rate": 9.324032258064517e-05, + "loss": 0.1714, + "step": 4692 + }, + { + "epoch": 0.075088, + "grad_norm": 0.82421875, + "learning_rate": 9.323870967741936e-05, + "loss": 0.1954, + "step": 4693 + }, + { + "epoch": 0.075104, + "grad_norm": 1.359375, + "learning_rate": 9.323709677419356e-05, + "loss": 0.1825, + "step": 4694 + }, + { + "epoch": 0.07512, + "grad_norm": 1.109375, + "learning_rate": 9.323548387096775e-05, + "loss": 0.2363, + "step": 4695 + }, + { + "epoch": 0.075136, + "grad_norm": 1.0234375, + "learning_rate": 9.323387096774195e-05, + "loss": 0.2315, + "step": 4696 + }, + { + "epoch": 0.075152, + "grad_norm": 1.1171875, + "learning_rate": 9.323225806451613e-05, + "loss": 0.2203, + "step": 4697 + }, + { + "epoch": 0.075168, + "grad_norm": 0.5625, + "learning_rate": 9.323064516129033e-05, + "loss": 0.1363, + "step": 4698 + }, + { + "epoch": 0.075184, + "grad_norm": 0.6171875, + "learning_rate": 9.322903225806452e-05, + "loss": 0.1862, + "step": 4699 + }, + { + "epoch": 0.0752, + "grad_norm": 1.2734375, + "learning_rate": 9.322741935483872e-05, + "loss": 0.1647, + "step": 4700 + }, + { + "epoch": 0.075216, + "grad_norm": 0.7265625, + "learning_rate": 9.32258064516129e-05, + "loss": 0.1772, + "step": 4701 + }, + { + "epoch": 0.075232, + "grad_norm": 1.0078125, + "learning_rate": 9.32241935483871e-05, + "loss": 0.2046, + "step": 4702 + }, + { + "epoch": 0.075248, + "grad_norm": 0.96875, + "learning_rate": 9.322258064516129e-05, + "loss": 0.1837, + "step": 4703 + }, + { + "epoch": 0.075264, + "grad_norm": 0.50390625, + "learning_rate": 9.322096774193549e-05, + "loss": 0.142, + "step": 4704 + }, + { + "epoch": 0.07528, + "grad_norm": 0.6171875, + "learning_rate": 9.321935483870969e-05, + "loss": 0.1593, + "step": 4705 + }, + { + "epoch": 0.075296, + "grad_norm": 0.74609375, + "learning_rate": 9.321774193548387e-05, + "loss": 0.1908, + "step": 4706 + }, + { + "epoch": 0.075312, + "grad_norm": 0.8984375, + "learning_rate": 9.321612903225807e-05, + "loss": 0.2418, + "step": 4707 + }, + { + "epoch": 0.075328, + "grad_norm": 0.71484375, + "learning_rate": 9.321451612903226e-05, + "loss": 0.1985, + "step": 4708 + }, + { + "epoch": 0.075344, + "grad_norm": 0.74609375, + "learning_rate": 9.321290322580646e-05, + "loss": 0.156, + "step": 4709 + }, + { + "epoch": 0.07536, + "grad_norm": 0.99609375, + "learning_rate": 9.321129032258065e-05, + "loss": 0.2155, + "step": 4710 + }, + { + "epoch": 0.075376, + "grad_norm": 1.25, + "learning_rate": 9.320967741935485e-05, + "loss": 0.1856, + "step": 4711 + }, + { + "epoch": 0.075392, + "grad_norm": 0.734375, + "learning_rate": 9.320806451612903e-05, + "loss": 0.1527, + "step": 4712 + }, + { + "epoch": 0.075408, + "grad_norm": 0.7265625, + "learning_rate": 9.320645161290323e-05, + "loss": 0.1764, + "step": 4713 + }, + { + "epoch": 0.075424, + "grad_norm": 0.81640625, + "learning_rate": 9.320483870967742e-05, + "loss": 0.1818, + "step": 4714 + }, + { + "epoch": 0.07544, + "grad_norm": 0.9140625, + "learning_rate": 9.320322580645162e-05, + "loss": 0.2455, + "step": 4715 + }, + { + "epoch": 0.075456, + "grad_norm": 1.0625, + "learning_rate": 9.320161290322582e-05, + "loss": 0.2275, + "step": 4716 + }, + { + "epoch": 0.075472, + "grad_norm": 0.50390625, + "learning_rate": 9.320000000000002e-05, + "loss": 0.1375, + "step": 4717 + }, + { + "epoch": 0.075488, + "grad_norm": 0.86328125, + "learning_rate": 9.31983870967742e-05, + "loss": 0.2169, + "step": 4718 + }, + { + "epoch": 0.075504, + "grad_norm": 1.2265625, + "learning_rate": 9.319677419354839e-05, + "loss": 0.1941, + "step": 4719 + }, + { + "epoch": 0.07552, + "grad_norm": 1.3125, + "learning_rate": 9.319516129032259e-05, + "loss": 0.2286, + "step": 4720 + }, + { + "epoch": 0.075536, + "grad_norm": 0.8359375, + "learning_rate": 9.319354838709677e-05, + "loss": 0.1974, + "step": 4721 + }, + { + "epoch": 0.075552, + "grad_norm": 0.5390625, + "learning_rate": 9.319193548387097e-05, + "loss": 0.184, + "step": 4722 + }, + { + "epoch": 0.075568, + "grad_norm": 0.546875, + "learning_rate": 9.319032258064516e-05, + "loss": 0.1945, + "step": 4723 + }, + { + "epoch": 0.075584, + "grad_norm": 0.43359375, + "learning_rate": 9.318870967741936e-05, + "loss": 0.1473, + "step": 4724 + }, + { + "epoch": 0.0756, + "grad_norm": 1.8125, + "learning_rate": 9.318709677419354e-05, + "loss": 0.2267, + "step": 4725 + }, + { + "epoch": 0.075616, + "grad_norm": 0.6875, + "learning_rate": 9.318548387096774e-05, + "loss": 0.1627, + "step": 4726 + }, + { + "epoch": 0.075632, + "grad_norm": 1.15625, + "learning_rate": 9.318387096774194e-05, + "loss": 0.1557, + "step": 4727 + }, + { + "epoch": 0.075648, + "grad_norm": 1.0390625, + "learning_rate": 9.318225806451613e-05, + "loss": 0.1975, + "step": 4728 + }, + { + "epoch": 0.075664, + "grad_norm": 1.5703125, + "learning_rate": 9.318064516129033e-05, + "loss": 0.1878, + "step": 4729 + }, + { + "epoch": 0.07568, + "grad_norm": 0.8203125, + "learning_rate": 9.317903225806453e-05, + "loss": 0.2087, + "step": 4730 + }, + { + "epoch": 0.075696, + "grad_norm": 1.0078125, + "learning_rate": 9.317741935483872e-05, + "loss": 0.2388, + "step": 4731 + }, + { + "epoch": 0.075712, + "grad_norm": 0.7578125, + "learning_rate": 9.317580645161292e-05, + "loss": 0.1859, + "step": 4732 + }, + { + "epoch": 0.075728, + "grad_norm": 0.64453125, + "learning_rate": 9.31741935483871e-05, + "loss": 0.1494, + "step": 4733 + }, + { + "epoch": 0.075744, + "grad_norm": 0.6640625, + "learning_rate": 9.317258064516129e-05, + "loss": 0.1714, + "step": 4734 + }, + { + "epoch": 0.07576, + "grad_norm": 0.91796875, + "learning_rate": 9.317096774193549e-05, + "loss": 0.1579, + "step": 4735 + }, + { + "epoch": 0.075776, + "grad_norm": 1.0, + "learning_rate": 9.316935483870967e-05, + "loss": 0.2376, + "step": 4736 + }, + { + "epoch": 0.075792, + "grad_norm": 0.6875, + "learning_rate": 9.316774193548387e-05, + "loss": 0.1663, + "step": 4737 + }, + { + "epoch": 0.075808, + "grad_norm": 0.56640625, + "learning_rate": 9.316612903225806e-05, + "loss": 0.1582, + "step": 4738 + }, + { + "epoch": 0.075824, + "grad_norm": 0.71484375, + "learning_rate": 9.316451612903226e-05, + "loss": 0.1658, + "step": 4739 + }, + { + "epoch": 0.07584, + "grad_norm": 0.921875, + "learning_rate": 9.316290322580646e-05, + "loss": 0.1641, + "step": 4740 + }, + { + "epoch": 0.075856, + "grad_norm": 0.63671875, + "learning_rate": 9.316129032258066e-05, + "loss": 0.1643, + "step": 4741 + }, + { + "epoch": 0.075872, + "grad_norm": 1.1640625, + "learning_rate": 9.315967741935484e-05, + "loss": 0.1467, + "step": 4742 + }, + { + "epoch": 0.075888, + "grad_norm": 0.7734375, + "learning_rate": 9.315806451612904e-05, + "loss": 0.1622, + "step": 4743 + }, + { + "epoch": 0.075904, + "grad_norm": 0.67578125, + "learning_rate": 9.315645161290323e-05, + "loss": 0.167, + "step": 4744 + }, + { + "epoch": 0.07592, + "grad_norm": 0.83203125, + "learning_rate": 9.315483870967743e-05, + "loss": 0.191, + "step": 4745 + }, + { + "epoch": 0.075936, + "grad_norm": 0.703125, + "learning_rate": 9.315322580645162e-05, + "loss": 0.2011, + "step": 4746 + }, + { + "epoch": 0.075952, + "grad_norm": 1.0859375, + "learning_rate": 9.315161290322581e-05, + "loss": 0.2075, + "step": 4747 + }, + { + "epoch": 0.075968, + "grad_norm": 0.78125, + "learning_rate": 9.315e-05, + "loss": 0.1612, + "step": 4748 + }, + { + "epoch": 0.075984, + "grad_norm": 0.9453125, + "learning_rate": 9.31483870967742e-05, + "loss": 0.2041, + "step": 4749 + }, + { + "epoch": 0.076, + "grad_norm": 0.94140625, + "learning_rate": 9.314677419354839e-05, + "loss": 0.2054, + "step": 4750 + }, + { + "epoch": 0.076016, + "grad_norm": 0.984375, + "learning_rate": 9.314516129032259e-05, + "loss": 0.2365, + "step": 4751 + }, + { + "epoch": 0.076032, + "grad_norm": 0.671875, + "learning_rate": 9.314354838709679e-05, + "loss": 0.1401, + "step": 4752 + }, + { + "epoch": 0.076048, + "grad_norm": 1.6015625, + "learning_rate": 9.314193548387097e-05, + "loss": 0.2147, + "step": 4753 + }, + { + "epoch": 0.076064, + "grad_norm": 0.62109375, + "learning_rate": 9.314032258064517e-05, + "loss": 0.1823, + "step": 4754 + }, + { + "epoch": 0.07608, + "grad_norm": 0.7421875, + "learning_rate": 9.313870967741936e-05, + "loss": 0.2285, + "step": 4755 + }, + { + "epoch": 0.076096, + "grad_norm": 0.69140625, + "learning_rate": 9.313709677419356e-05, + "loss": 0.1708, + "step": 4756 + }, + { + "epoch": 0.076112, + "grad_norm": 0.80078125, + "learning_rate": 9.313548387096774e-05, + "loss": 0.1576, + "step": 4757 + }, + { + "epoch": 0.076128, + "grad_norm": 1.0390625, + "learning_rate": 9.313387096774194e-05, + "loss": 0.1992, + "step": 4758 + }, + { + "epoch": 0.076144, + "grad_norm": 1.046875, + "learning_rate": 9.313225806451613e-05, + "loss": 0.1929, + "step": 4759 + }, + { + "epoch": 0.07616, + "grad_norm": 0.625, + "learning_rate": 9.313064516129033e-05, + "loss": 0.1505, + "step": 4760 + }, + { + "epoch": 0.076176, + "grad_norm": 0.6328125, + "learning_rate": 9.312903225806451e-05, + "loss": 0.1728, + "step": 4761 + }, + { + "epoch": 0.076192, + "grad_norm": 0.79296875, + "learning_rate": 9.312741935483871e-05, + "loss": 0.1763, + "step": 4762 + }, + { + "epoch": 0.076208, + "grad_norm": 0.82421875, + "learning_rate": 9.31258064516129e-05, + "loss": 0.1945, + "step": 4763 + }, + { + "epoch": 0.076224, + "grad_norm": 1.0859375, + "learning_rate": 9.31241935483871e-05, + "loss": 0.1673, + "step": 4764 + }, + { + "epoch": 0.07624, + "grad_norm": 0.8125, + "learning_rate": 9.31225806451613e-05, + "loss": 0.1842, + "step": 4765 + }, + { + "epoch": 0.076256, + "grad_norm": 0.7578125, + "learning_rate": 9.312096774193549e-05, + "loss": 0.1824, + "step": 4766 + }, + { + "epoch": 0.076272, + "grad_norm": 0.90234375, + "learning_rate": 9.311935483870969e-05, + "loss": 0.1722, + "step": 4767 + }, + { + "epoch": 0.076288, + "grad_norm": 0.6484375, + "learning_rate": 9.311774193548387e-05, + "loss": 0.1608, + "step": 4768 + }, + { + "epoch": 0.076304, + "grad_norm": 0.6640625, + "learning_rate": 9.311612903225807e-05, + "loss": 0.162, + "step": 4769 + }, + { + "epoch": 0.07632, + "grad_norm": 1.0625, + "learning_rate": 9.311451612903226e-05, + "loss": 0.1653, + "step": 4770 + }, + { + "epoch": 0.076336, + "grad_norm": 0.9140625, + "learning_rate": 9.311290322580646e-05, + "loss": 0.2163, + "step": 4771 + }, + { + "epoch": 0.076352, + "grad_norm": 0.9609375, + "learning_rate": 9.311129032258064e-05, + "loss": 0.1677, + "step": 4772 + }, + { + "epoch": 0.076368, + "grad_norm": 0.765625, + "learning_rate": 9.310967741935484e-05, + "loss": 0.2009, + "step": 4773 + }, + { + "epoch": 0.076384, + "grad_norm": 0.625, + "learning_rate": 9.310806451612903e-05, + "loss": 0.1761, + "step": 4774 + }, + { + "epoch": 0.0764, + "grad_norm": 0.859375, + "learning_rate": 9.310645161290323e-05, + "loss": 0.1478, + "step": 4775 + }, + { + "epoch": 0.076416, + "grad_norm": 0.7578125, + "learning_rate": 9.310483870967743e-05, + "loss": 0.1888, + "step": 4776 + }, + { + "epoch": 0.076432, + "grad_norm": 0.765625, + "learning_rate": 9.310322580645163e-05, + "loss": 0.1774, + "step": 4777 + }, + { + "epoch": 0.076448, + "grad_norm": 1.09375, + "learning_rate": 9.310161290322581e-05, + "loss": 0.1959, + "step": 4778 + }, + { + "epoch": 0.076464, + "grad_norm": 0.71875, + "learning_rate": 9.310000000000001e-05, + "loss": 0.1645, + "step": 4779 + }, + { + "epoch": 0.07648, + "grad_norm": 0.703125, + "learning_rate": 9.30983870967742e-05, + "loss": 0.1392, + "step": 4780 + }, + { + "epoch": 0.076496, + "grad_norm": 0.83984375, + "learning_rate": 9.309677419354839e-05, + "loss": 0.1191, + "step": 4781 + }, + { + "epoch": 0.076512, + "grad_norm": 1.0234375, + "learning_rate": 9.309516129032259e-05, + "loss": 0.2145, + "step": 4782 + }, + { + "epoch": 0.076528, + "grad_norm": 0.82421875, + "learning_rate": 9.309354838709677e-05, + "loss": 0.1779, + "step": 4783 + }, + { + "epoch": 0.076544, + "grad_norm": 1.1015625, + "learning_rate": 9.309193548387097e-05, + "loss": 0.182, + "step": 4784 + }, + { + "epoch": 0.07656, + "grad_norm": 0.8828125, + "learning_rate": 9.309032258064516e-05, + "loss": 0.1931, + "step": 4785 + }, + { + "epoch": 0.076576, + "grad_norm": 0.9765625, + "learning_rate": 9.308870967741936e-05, + "loss": 0.2001, + "step": 4786 + }, + { + "epoch": 0.076592, + "grad_norm": 1.0234375, + "learning_rate": 9.308709677419356e-05, + "loss": 0.1654, + "step": 4787 + }, + { + "epoch": 0.076608, + "grad_norm": 0.478515625, + "learning_rate": 9.308548387096776e-05, + "loss": 0.1213, + "step": 4788 + }, + { + "epoch": 0.076624, + "grad_norm": 0.71875, + "learning_rate": 9.308387096774194e-05, + "loss": 0.188, + "step": 4789 + }, + { + "epoch": 0.07664, + "grad_norm": 0.78125, + "learning_rate": 9.308225806451614e-05, + "loss": 0.2099, + "step": 4790 + }, + { + "epoch": 0.076656, + "grad_norm": 0.75390625, + "learning_rate": 9.308064516129033e-05, + "loss": 0.1688, + "step": 4791 + }, + { + "epoch": 0.076672, + "grad_norm": 0.6328125, + "learning_rate": 9.307903225806453e-05, + "loss": 0.184, + "step": 4792 + }, + { + "epoch": 0.076688, + "grad_norm": 0.87890625, + "learning_rate": 9.307741935483871e-05, + "loss": 0.1525, + "step": 4793 + }, + { + "epoch": 0.076704, + "grad_norm": 1.46875, + "learning_rate": 9.307580645161291e-05, + "loss": 0.1694, + "step": 4794 + }, + { + "epoch": 0.07672, + "grad_norm": 0.68359375, + "learning_rate": 9.30741935483871e-05, + "loss": 0.1758, + "step": 4795 + }, + { + "epoch": 0.076736, + "grad_norm": 0.65234375, + "learning_rate": 9.30725806451613e-05, + "loss": 0.1636, + "step": 4796 + }, + { + "epoch": 0.076752, + "grad_norm": 0.90234375, + "learning_rate": 9.307096774193548e-05, + "loss": 0.1837, + "step": 4797 + }, + { + "epoch": 0.076768, + "grad_norm": 1.09375, + "learning_rate": 9.306935483870967e-05, + "loss": 0.1653, + "step": 4798 + }, + { + "epoch": 0.076784, + "grad_norm": 1.015625, + "learning_rate": 9.306774193548387e-05, + "loss": 0.1795, + "step": 4799 + }, + { + "epoch": 0.0768, + "grad_norm": 0.7421875, + "learning_rate": 9.306612903225807e-05, + "loss": 0.1605, + "step": 4800 + }, + { + "epoch": 0.076816, + "grad_norm": 0.80078125, + "learning_rate": 9.306451612903227e-05, + "loss": 0.1923, + "step": 4801 + }, + { + "epoch": 0.076832, + "grad_norm": 0.93359375, + "learning_rate": 9.306290322580646e-05, + "loss": 0.1753, + "step": 4802 + }, + { + "epoch": 0.076848, + "grad_norm": 0.9609375, + "learning_rate": 9.306129032258066e-05, + "loss": 0.2425, + "step": 4803 + }, + { + "epoch": 0.076864, + "grad_norm": 0.81640625, + "learning_rate": 9.305967741935484e-05, + "loss": 0.1841, + "step": 4804 + }, + { + "epoch": 0.07688, + "grad_norm": 1.21875, + "learning_rate": 9.305806451612904e-05, + "loss": 0.1518, + "step": 4805 + }, + { + "epoch": 0.076896, + "grad_norm": 0.82421875, + "learning_rate": 9.305645161290323e-05, + "loss": 0.1893, + "step": 4806 + }, + { + "epoch": 0.076912, + "grad_norm": 0.76953125, + "learning_rate": 9.305483870967743e-05, + "loss": 0.1375, + "step": 4807 + }, + { + "epoch": 0.076928, + "grad_norm": 0.80078125, + "learning_rate": 9.305322580645161e-05, + "loss": 0.1702, + "step": 4808 + }, + { + "epoch": 0.076944, + "grad_norm": 1.0234375, + "learning_rate": 9.305161290322581e-05, + "loss": 0.2316, + "step": 4809 + }, + { + "epoch": 0.07696, + "grad_norm": 1.0703125, + "learning_rate": 9.305e-05, + "loss": 0.1934, + "step": 4810 + }, + { + "epoch": 0.076976, + "grad_norm": 0.9453125, + "learning_rate": 9.30483870967742e-05, + "loss": 0.2079, + "step": 4811 + }, + { + "epoch": 0.076992, + "grad_norm": 0.8203125, + "learning_rate": 9.30467741935484e-05, + "loss": 0.1762, + "step": 4812 + }, + { + "epoch": 0.077008, + "grad_norm": 1.1484375, + "learning_rate": 9.304516129032258e-05, + "loss": 0.1849, + "step": 4813 + }, + { + "epoch": 0.077024, + "grad_norm": 0.83984375, + "learning_rate": 9.304354838709678e-05, + "loss": 0.1728, + "step": 4814 + }, + { + "epoch": 0.07704, + "grad_norm": 0.96875, + "learning_rate": 9.304193548387097e-05, + "loss": 0.1896, + "step": 4815 + }, + { + "epoch": 0.077056, + "grad_norm": 0.765625, + "learning_rate": 9.304032258064517e-05, + "loss": 0.1732, + "step": 4816 + }, + { + "epoch": 0.077072, + "grad_norm": 0.75, + "learning_rate": 9.303870967741936e-05, + "loss": 0.2081, + "step": 4817 + }, + { + "epoch": 0.077088, + "grad_norm": 1.0546875, + "learning_rate": 9.303709677419355e-05, + "loss": 0.2132, + "step": 4818 + }, + { + "epoch": 0.077104, + "grad_norm": 1.0078125, + "learning_rate": 9.303548387096774e-05, + "loss": 0.1929, + "step": 4819 + }, + { + "epoch": 0.07712, + "grad_norm": 0.81640625, + "learning_rate": 9.303387096774194e-05, + "loss": 0.2153, + "step": 4820 + }, + { + "epoch": 0.077136, + "grad_norm": 0.859375, + "learning_rate": 9.303225806451613e-05, + "loss": 0.1912, + "step": 4821 + }, + { + "epoch": 0.077152, + "grad_norm": 0.671875, + "learning_rate": 9.303064516129033e-05, + "loss": 0.1847, + "step": 4822 + }, + { + "epoch": 0.077168, + "grad_norm": 0.875, + "learning_rate": 9.302903225806451e-05, + "loss": 0.1624, + "step": 4823 + }, + { + "epoch": 0.077184, + "grad_norm": 0.63671875, + "learning_rate": 9.302741935483871e-05, + "loss": 0.1798, + "step": 4824 + }, + { + "epoch": 0.0772, + "grad_norm": 0.6484375, + "learning_rate": 9.302580645161291e-05, + "loss": 0.1658, + "step": 4825 + }, + { + "epoch": 0.077216, + "grad_norm": 0.59375, + "learning_rate": 9.302419354838711e-05, + "loss": 0.207, + "step": 4826 + }, + { + "epoch": 0.077232, + "grad_norm": 0.76171875, + "learning_rate": 9.30225806451613e-05, + "loss": 0.1968, + "step": 4827 + }, + { + "epoch": 0.077248, + "grad_norm": 1.015625, + "learning_rate": 9.302096774193548e-05, + "loss": 0.2071, + "step": 4828 + }, + { + "epoch": 0.077264, + "grad_norm": 1.3046875, + "learning_rate": 9.301935483870968e-05, + "loss": 0.241, + "step": 4829 + }, + { + "epoch": 0.07728, + "grad_norm": 1.0703125, + "learning_rate": 9.301774193548387e-05, + "loss": 0.1746, + "step": 4830 + }, + { + "epoch": 0.077296, + "grad_norm": 0.5, + "learning_rate": 9.301612903225807e-05, + "loss": 0.1737, + "step": 4831 + }, + { + "epoch": 0.077312, + "grad_norm": 0.796875, + "learning_rate": 9.301451612903225e-05, + "loss": 0.188, + "step": 4832 + }, + { + "epoch": 0.077328, + "grad_norm": 0.88671875, + "learning_rate": 9.301290322580645e-05, + "loss": 0.1765, + "step": 4833 + }, + { + "epoch": 0.077344, + "grad_norm": 1.3828125, + "learning_rate": 9.301129032258064e-05, + "loss": 0.2012, + "step": 4834 + }, + { + "epoch": 0.07736, + "grad_norm": 0.49609375, + "learning_rate": 9.300967741935484e-05, + "loss": 0.139, + "step": 4835 + }, + { + "epoch": 0.077376, + "grad_norm": 0.953125, + "learning_rate": 9.300806451612904e-05, + "loss": 0.1793, + "step": 4836 + }, + { + "epoch": 0.077392, + "grad_norm": 0.84375, + "learning_rate": 9.300645161290324e-05, + "loss": 0.1951, + "step": 4837 + }, + { + "epoch": 0.077408, + "grad_norm": 0.55859375, + "learning_rate": 9.300483870967743e-05, + "loss": 0.1568, + "step": 4838 + }, + { + "epoch": 0.077424, + "grad_norm": 0.7109375, + "learning_rate": 9.300322580645163e-05, + "loss": 0.1547, + "step": 4839 + }, + { + "epoch": 0.07744, + "grad_norm": 0.95703125, + "learning_rate": 9.300161290322581e-05, + "loss": 0.2235, + "step": 4840 + }, + { + "epoch": 0.077456, + "grad_norm": 0.66796875, + "learning_rate": 9.300000000000001e-05, + "loss": 0.1751, + "step": 4841 + }, + { + "epoch": 0.077472, + "grad_norm": 1.234375, + "learning_rate": 9.29983870967742e-05, + "loss": 0.1322, + "step": 4842 + }, + { + "epoch": 0.077488, + "grad_norm": 0.6640625, + "learning_rate": 9.299677419354838e-05, + "loss": 0.1538, + "step": 4843 + }, + { + "epoch": 0.077504, + "grad_norm": 0.7109375, + "learning_rate": 9.299516129032258e-05, + "loss": 0.199, + "step": 4844 + }, + { + "epoch": 0.07752, + "grad_norm": 0.640625, + "learning_rate": 9.299354838709677e-05, + "loss": 0.1712, + "step": 4845 + }, + { + "epoch": 0.077536, + "grad_norm": 1.0546875, + "learning_rate": 9.299193548387097e-05, + "loss": 0.1323, + "step": 4846 + }, + { + "epoch": 0.077552, + "grad_norm": 0.70703125, + "learning_rate": 9.299032258064517e-05, + "loss": 0.1667, + "step": 4847 + }, + { + "epoch": 0.077568, + "grad_norm": 0.7734375, + "learning_rate": 9.298870967741937e-05, + "loss": 0.1425, + "step": 4848 + }, + { + "epoch": 0.077584, + "grad_norm": 1.0703125, + "learning_rate": 9.298709677419355e-05, + "loss": 0.1943, + "step": 4849 + }, + { + "epoch": 0.0776, + "grad_norm": 1.125, + "learning_rate": 9.298548387096775e-05, + "loss": 0.2045, + "step": 4850 + }, + { + "epoch": 0.077616, + "grad_norm": 0.65625, + "learning_rate": 9.298387096774194e-05, + "loss": 0.1493, + "step": 4851 + }, + { + "epoch": 0.077632, + "grad_norm": 1.109375, + "learning_rate": 9.298225806451614e-05, + "loss": 0.2624, + "step": 4852 + }, + { + "epoch": 0.077648, + "grad_norm": 0.59765625, + "learning_rate": 9.298064516129033e-05, + "loss": 0.1555, + "step": 4853 + }, + { + "epoch": 0.077664, + "grad_norm": 0.8828125, + "learning_rate": 9.297903225806452e-05, + "loss": 0.188, + "step": 4854 + }, + { + "epoch": 0.07768, + "grad_norm": 1.078125, + "learning_rate": 9.297741935483871e-05, + "loss": 0.1675, + "step": 4855 + }, + { + "epoch": 0.077696, + "grad_norm": 0.609375, + "learning_rate": 9.297580645161291e-05, + "loss": 0.1724, + "step": 4856 + }, + { + "epoch": 0.077712, + "grad_norm": 0.75390625, + "learning_rate": 9.29741935483871e-05, + "loss": 0.1933, + "step": 4857 + }, + { + "epoch": 0.077728, + "grad_norm": 1.0078125, + "learning_rate": 9.29725806451613e-05, + "loss": 0.2189, + "step": 4858 + }, + { + "epoch": 0.077744, + "grad_norm": 0.65234375, + "learning_rate": 9.297096774193548e-05, + "loss": 0.1963, + "step": 4859 + }, + { + "epoch": 0.07776, + "grad_norm": 0.76171875, + "learning_rate": 9.296935483870968e-05, + "loss": 0.1739, + "step": 4860 + }, + { + "epoch": 0.077776, + "grad_norm": 0.95703125, + "learning_rate": 9.296774193548388e-05, + "loss": 0.1595, + "step": 4861 + }, + { + "epoch": 0.077792, + "grad_norm": 0.609375, + "learning_rate": 9.296612903225807e-05, + "loss": 0.1733, + "step": 4862 + }, + { + "epoch": 0.077808, + "grad_norm": 1.3671875, + "learning_rate": 9.296451612903227e-05, + "loss": 0.1745, + "step": 4863 + }, + { + "epoch": 0.077824, + "grad_norm": 0.828125, + "learning_rate": 9.296290322580645e-05, + "loss": 0.2057, + "step": 4864 + }, + { + "epoch": 0.07784, + "grad_norm": 0.6875, + "learning_rate": 9.296129032258065e-05, + "loss": 0.1833, + "step": 4865 + }, + { + "epoch": 0.077856, + "grad_norm": 0.97265625, + "learning_rate": 9.295967741935484e-05, + "loss": 0.1996, + "step": 4866 + }, + { + "epoch": 0.077872, + "grad_norm": 1.1328125, + "learning_rate": 9.295806451612904e-05, + "loss": 0.1804, + "step": 4867 + }, + { + "epoch": 0.077888, + "grad_norm": 0.71875, + "learning_rate": 9.295645161290322e-05, + "loss": 0.211, + "step": 4868 + }, + { + "epoch": 0.077904, + "grad_norm": 0.76171875, + "learning_rate": 9.295483870967742e-05, + "loss": 0.2011, + "step": 4869 + }, + { + "epoch": 0.07792, + "grad_norm": 1.09375, + "learning_rate": 9.295322580645161e-05, + "loss": 0.2048, + "step": 4870 + }, + { + "epoch": 0.077936, + "grad_norm": 0.66796875, + "learning_rate": 9.295161290322581e-05, + "loss": 0.1392, + "step": 4871 + }, + { + "epoch": 0.077952, + "grad_norm": 0.69140625, + "learning_rate": 9.295000000000001e-05, + "loss": 0.1878, + "step": 4872 + }, + { + "epoch": 0.077968, + "grad_norm": 0.88671875, + "learning_rate": 9.294838709677421e-05, + "loss": 0.1798, + "step": 4873 + }, + { + "epoch": 0.077984, + "grad_norm": 0.765625, + "learning_rate": 9.29467741935484e-05, + "loss": 0.1845, + "step": 4874 + }, + { + "epoch": 0.078, + "grad_norm": 0.91796875, + "learning_rate": 9.294516129032258e-05, + "loss": 0.1717, + "step": 4875 + }, + { + "epoch": 0.078016, + "grad_norm": 0.69140625, + "learning_rate": 9.294354838709678e-05, + "loss": 0.1833, + "step": 4876 + }, + { + "epoch": 0.078032, + "grad_norm": 0.7265625, + "learning_rate": 9.294193548387097e-05, + "loss": 0.2121, + "step": 4877 + }, + { + "epoch": 0.078048, + "grad_norm": 0.69921875, + "learning_rate": 9.294032258064517e-05, + "loss": 0.1426, + "step": 4878 + }, + { + "epoch": 0.078064, + "grad_norm": 1.1953125, + "learning_rate": 9.293870967741935e-05, + "loss": 0.2493, + "step": 4879 + }, + { + "epoch": 0.07808, + "grad_norm": 0.765625, + "learning_rate": 9.293709677419355e-05, + "loss": 0.1787, + "step": 4880 + }, + { + "epoch": 0.078096, + "grad_norm": 1.234375, + "learning_rate": 9.293548387096774e-05, + "loss": 0.1819, + "step": 4881 + }, + { + "epoch": 0.078112, + "grad_norm": 0.9921875, + "learning_rate": 9.293387096774194e-05, + "loss": 0.1891, + "step": 4882 + }, + { + "epoch": 0.078128, + "grad_norm": 0.890625, + "learning_rate": 9.293225806451614e-05, + "loss": 0.1749, + "step": 4883 + }, + { + "epoch": 0.078144, + "grad_norm": 1.2890625, + "learning_rate": 9.293064516129032e-05, + "loss": 0.2306, + "step": 4884 + }, + { + "epoch": 0.07816, + "grad_norm": 1.09375, + "learning_rate": 9.292903225806452e-05, + "loss": 0.1385, + "step": 4885 + }, + { + "epoch": 0.078176, + "grad_norm": 0.64453125, + "learning_rate": 9.292741935483872e-05, + "loss": 0.19, + "step": 4886 + }, + { + "epoch": 0.078192, + "grad_norm": 1.0625, + "learning_rate": 9.292580645161291e-05, + "loss": 0.2117, + "step": 4887 + }, + { + "epoch": 0.078208, + "grad_norm": 0.75, + "learning_rate": 9.292419354838711e-05, + "loss": 0.1912, + "step": 4888 + }, + { + "epoch": 0.078224, + "grad_norm": 1.2578125, + "learning_rate": 9.29225806451613e-05, + "loss": 0.1897, + "step": 4889 + }, + { + "epoch": 0.07824, + "grad_norm": 0.96484375, + "learning_rate": 9.292096774193548e-05, + "loss": 0.1784, + "step": 4890 + }, + { + "epoch": 0.078256, + "grad_norm": 0.84765625, + "learning_rate": 9.291935483870968e-05, + "loss": 0.1667, + "step": 4891 + }, + { + "epoch": 0.078272, + "grad_norm": 0.875, + "learning_rate": 9.291774193548387e-05, + "loss": 0.1884, + "step": 4892 + }, + { + "epoch": 0.078288, + "grad_norm": 0.78125, + "learning_rate": 9.291612903225807e-05, + "loss": 0.188, + "step": 4893 + }, + { + "epoch": 0.078304, + "grad_norm": 0.7421875, + "learning_rate": 9.291451612903225e-05, + "loss": 0.1814, + "step": 4894 + }, + { + "epoch": 0.07832, + "grad_norm": 1.0078125, + "learning_rate": 9.291290322580645e-05, + "loss": 0.2097, + "step": 4895 + }, + { + "epoch": 0.078336, + "grad_norm": 0.78125, + "learning_rate": 9.291129032258065e-05, + "loss": 0.1968, + "step": 4896 + }, + { + "epoch": 0.078352, + "grad_norm": 1.0859375, + "learning_rate": 9.290967741935485e-05, + "loss": 0.2286, + "step": 4897 + }, + { + "epoch": 0.078368, + "grad_norm": 1.015625, + "learning_rate": 9.290806451612904e-05, + "loss": 0.1875, + "step": 4898 + }, + { + "epoch": 0.078384, + "grad_norm": 0.87109375, + "learning_rate": 9.290645161290324e-05, + "loss": 0.1774, + "step": 4899 + }, + { + "epoch": 0.0784, + "grad_norm": 1.2890625, + "learning_rate": 9.290483870967742e-05, + "loss": 0.1919, + "step": 4900 + }, + { + "epoch": 0.078416, + "grad_norm": 0.640625, + "learning_rate": 9.290322580645162e-05, + "loss": 0.1701, + "step": 4901 + }, + { + "epoch": 0.078432, + "grad_norm": 1.09375, + "learning_rate": 9.290161290322581e-05, + "loss": 0.1727, + "step": 4902 + }, + { + "epoch": 0.078448, + "grad_norm": 0.8046875, + "learning_rate": 9.290000000000001e-05, + "loss": 0.1885, + "step": 4903 + }, + { + "epoch": 0.078464, + "grad_norm": 1.1015625, + "learning_rate": 9.28983870967742e-05, + "loss": 0.1952, + "step": 4904 + }, + { + "epoch": 0.07848, + "grad_norm": 0.625, + "learning_rate": 9.28967741935484e-05, + "loss": 0.1575, + "step": 4905 + }, + { + "epoch": 0.078496, + "grad_norm": 0.8359375, + "learning_rate": 9.289516129032258e-05, + "loss": 0.1981, + "step": 4906 + }, + { + "epoch": 0.078512, + "grad_norm": 1.359375, + "learning_rate": 9.289354838709678e-05, + "loss": 0.1339, + "step": 4907 + }, + { + "epoch": 0.078528, + "grad_norm": 0.859375, + "learning_rate": 9.289193548387098e-05, + "loss": 0.1931, + "step": 4908 + }, + { + "epoch": 0.078544, + "grad_norm": 1.0859375, + "learning_rate": 9.289032258064517e-05, + "loss": 0.2131, + "step": 4909 + }, + { + "epoch": 0.07856, + "grad_norm": 0.84765625, + "learning_rate": 9.288870967741937e-05, + "loss": 0.2078, + "step": 4910 + }, + { + "epoch": 0.078576, + "grad_norm": 0.78125, + "learning_rate": 9.288709677419355e-05, + "loss": 0.2043, + "step": 4911 + }, + { + "epoch": 0.078592, + "grad_norm": 0.75, + "learning_rate": 9.288548387096775e-05, + "loss": 0.1988, + "step": 4912 + }, + { + "epoch": 0.078608, + "grad_norm": 0.796875, + "learning_rate": 9.288387096774194e-05, + "loss": 0.1589, + "step": 4913 + }, + { + "epoch": 0.078624, + "grad_norm": 1.28125, + "learning_rate": 9.288225806451614e-05, + "loss": 0.2493, + "step": 4914 + }, + { + "epoch": 0.07864, + "grad_norm": 0.83984375, + "learning_rate": 9.288064516129032e-05, + "loss": 0.1792, + "step": 4915 + }, + { + "epoch": 0.078656, + "grad_norm": 0.90234375, + "learning_rate": 9.287903225806452e-05, + "loss": 0.1966, + "step": 4916 + }, + { + "epoch": 0.078672, + "grad_norm": 0.8828125, + "learning_rate": 9.287741935483871e-05, + "loss": 0.1539, + "step": 4917 + }, + { + "epoch": 0.078688, + "grad_norm": 0.765625, + "learning_rate": 9.287580645161291e-05, + "loss": 0.1508, + "step": 4918 + }, + { + "epoch": 0.078704, + "grad_norm": 1.515625, + "learning_rate": 9.28741935483871e-05, + "loss": 0.2264, + "step": 4919 + }, + { + "epoch": 0.07872, + "grad_norm": 0.6015625, + "learning_rate": 9.28725806451613e-05, + "loss": 0.1953, + "step": 4920 + }, + { + "epoch": 0.078736, + "grad_norm": 0.70703125, + "learning_rate": 9.28709677419355e-05, + "loss": 0.1358, + "step": 4921 + }, + { + "epoch": 0.078752, + "grad_norm": 0.98828125, + "learning_rate": 9.286935483870968e-05, + "loss": 0.1513, + "step": 4922 + }, + { + "epoch": 0.078768, + "grad_norm": 0.890625, + "learning_rate": 9.286774193548388e-05, + "loss": 0.2006, + "step": 4923 + }, + { + "epoch": 0.078784, + "grad_norm": 0.86328125, + "learning_rate": 9.286612903225807e-05, + "loss": 0.1726, + "step": 4924 + }, + { + "epoch": 0.0788, + "grad_norm": 1.5625, + "learning_rate": 9.286451612903226e-05, + "loss": 0.1648, + "step": 4925 + }, + { + "epoch": 0.078816, + "grad_norm": 1.359375, + "learning_rate": 9.286290322580645e-05, + "loss": 0.1952, + "step": 4926 + }, + { + "epoch": 0.078832, + "grad_norm": 0.7734375, + "learning_rate": 9.286129032258065e-05, + "loss": 0.1905, + "step": 4927 + }, + { + "epoch": 0.078848, + "grad_norm": 1.1171875, + "learning_rate": 9.285967741935484e-05, + "loss": 0.1765, + "step": 4928 + }, + { + "epoch": 0.078864, + "grad_norm": 0.86328125, + "learning_rate": 9.285806451612904e-05, + "loss": 0.1894, + "step": 4929 + }, + { + "epoch": 0.07888, + "grad_norm": 0.96484375, + "learning_rate": 9.285645161290322e-05, + "loss": 0.1938, + "step": 4930 + }, + { + "epoch": 0.078896, + "grad_norm": 0.87109375, + "learning_rate": 9.285483870967742e-05, + "loss": 0.16, + "step": 4931 + }, + { + "epoch": 0.078912, + "grad_norm": 0.95703125, + "learning_rate": 9.285322580645162e-05, + "loss": 0.1687, + "step": 4932 + }, + { + "epoch": 0.078928, + "grad_norm": 0.671875, + "learning_rate": 9.285161290322582e-05, + "loss": 0.1668, + "step": 4933 + }, + { + "epoch": 0.078944, + "grad_norm": 0.8828125, + "learning_rate": 9.285000000000001e-05, + "loss": 0.2074, + "step": 4934 + }, + { + "epoch": 0.07896, + "grad_norm": 0.96875, + "learning_rate": 9.284838709677421e-05, + "loss": 0.1802, + "step": 4935 + }, + { + "epoch": 0.078976, + "grad_norm": 1.203125, + "learning_rate": 9.284677419354839e-05, + "loss": 0.2066, + "step": 4936 + }, + { + "epoch": 0.078992, + "grad_norm": 0.71875, + "learning_rate": 9.284516129032258e-05, + "loss": 0.1769, + "step": 4937 + }, + { + "epoch": 0.079008, + "grad_norm": 0.9765625, + "learning_rate": 9.284354838709678e-05, + "loss": 0.1711, + "step": 4938 + }, + { + "epoch": 0.079024, + "grad_norm": 0.9765625, + "learning_rate": 9.284193548387096e-05, + "loss": 0.2191, + "step": 4939 + }, + { + "epoch": 0.07904, + "grad_norm": 0.90234375, + "learning_rate": 9.284032258064516e-05, + "loss": 0.1704, + "step": 4940 + }, + { + "epoch": 0.079056, + "grad_norm": 0.99609375, + "learning_rate": 9.283870967741935e-05, + "loss": 0.1424, + "step": 4941 + }, + { + "epoch": 0.079072, + "grad_norm": 1.015625, + "learning_rate": 9.283709677419355e-05, + "loss": 0.1847, + "step": 4942 + }, + { + "epoch": 0.079088, + "grad_norm": 0.9921875, + "learning_rate": 9.283548387096775e-05, + "loss": 0.1457, + "step": 4943 + }, + { + "epoch": 0.079104, + "grad_norm": 0.62890625, + "learning_rate": 9.283387096774195e-05, + "loss": 0.1719, + "step": 4944 + }, + { + "epoch": 0.07912, + "grad_norm": 1.0, + "learning_rate": 9.283225806451614e-05, + "loss": 0.138, + "step": 4945 + }, + { + "epoch": 0.079136, + "grad_norm": 0.98828125, + "learning_rate": 9.283064516129033e-05, + "loss": 0.207, + "step": 4946 + }, + { + "epoch": 0.079152, + "grad_norm": 0.703125, + "learning_rate": 9.282903225806452e-05, + "loss": 0.2117, + "step": 4947 + }, + { + "epoch": 0.079168, + "grad_norm": 1.1484375, + "learning_rate": 9.282741935483872e-05, + "loss": 0.2119, + "step": 4948 + }, + { + "epoch": 0.079184, + "grad_norm": 0.7109375, + "learning_rate": 9.28258064516129e-05, + "loss": 0.1962, + "step": 4949 + }, + { + "epoch": 0.0792, + "grad_norm": 1.1015625, + "learning_rate": 9.28241935483871e-05, + "loss": 0.2043, + "step": 4950 + }, + { + "epoch": 0.079216, + "grad_norm": 1.2734375, + "learning_rate": 9.282258064516129e-05, + "loss": 0.2297, + "step": 4951 + }, + { + "epoch": 0.079232, + "grad_norm": 0.61328125, + "learning_rate": 9.282096774193548e-05, + "loss": 0.2015, + "step": 4952 + }, + { + "epoch": 0.079248, + "grad_norm": 0.69921875, + "learning_rate": 9.281935483870968e-05, + "loss": 0.1672, + "step": 4953 + }, + { + "epoch": 0.079264, + "grad_norm": 0.9296875, + "learning_rate": 9.281774193548386e-05, + "loss": 0.2104, + "step": 4954 + }, + { + "epoch": 0.07928, + "grad_norm": 0.90625, + "learning_rate": 9.281612903225806e-05, + "loss": 0.2081, + "step": 4955 + }, + { + "epoch": 0.079296, + "grad_norm": 1.0078125, + "learning_rate": 9.281451612903226e-05, + "loss": 0.1519, + "step": 4956 + }, + { + "epoch": 0.079312, + "grad_norm": 0.578125, + "learning_rate": 9.281290322580646e-05, + "loss": 0.1567, + "step": 4957 + }, + { + "epoch": 0.079328, + "grad_norm": 0.87890625, + "learning_rate": 9.281129032258065e-05, + "loss": 0.2049, + "step": 4958 + }, + { + "epoch": 0.079344, + "grad_norm": 0.66796875, + "learning_rate": 9.280967741935485e-05, + "loss": 0.1897, + "step": 4959 + }, + { + "epoch": 0.07936, + "grad_norm": 0.72265625, + "learning_rate": 9.280806451612903e-05, + "loss": 0.2075, + "step": 4960 + }, + { + "epoch": 0.079376, + "grad_norm": 0.86328125, + "learning_rate": 9.280645161290323e-05, + "loss": 0.1362, + "step": 4961 + }, + { + "epoch": 0.079392, + "grad_norm": 0.8046875, + "learning_rate": 9.280483870967742e-05, + "loss": 0.1736, + "step": 4962 + }, + { + "epoch": 0.079408, + "grad_norm": 0.71484375, + "learning_rate": 9.280322580645162e-05, + "loss": 0.2285, + "step": 4963 + }, + { + "epoch": 0.079424, + "grad_norm": 1.0546875, + "learning_rate": 9.28016129032258e-05, + "loss": 0.2093, + "step": 4964 + }, + { + "epoch": 0.07944, + "grad_norm": 0.8046875, + "learning_rate": 9.28e-05, + "loss": 0.1236, + "step": 4965 + }, + { + "epoch": 0.079456, + "grad_norm": 0.63671875, + "learning_rate": 9.279838709677419e-05, + "loss": 0.1968, + "step": 4966 + }, + { + "epoch": 0.079472, + "grad_norm": 1.3984375, + "learning_rate": 9.279677419354839e-05, + "loss": 0.1729, + "step": 4967 + }, + { + "epoch": 0.079488, + "grad_norm": 0.62109375, + "learning_rate": 9.279516129032259e-05, + "loss": 0.2, + "step": 4968 + }, + { + "epoch": 0.079504, + "grad_norm": 0.66015625, + "learning_rate": 9.279354838709678e-05, + "loss": 0.1707, + "step": 4969 + }, + { + "epoch": 0.07952, + "grad_norm": 0.84375, + "learning_rate": 9.279193548387098e-05, + "loss": 0.1909, + "step": 4970 + }, + { + "epoch": 0.079536, + "grad_norm": 0.86328125, + "learning_rate": 9.279032258064516e-05, + "loss": 0.1818, + "step": 4971 + }, + { + "epoch": 0.079552, + "grad_norm": 0.6953125, + "learning_rate": 9.278870967741936e-05, + "loss": 0.1606, + "step": 4972 + }, + { + "epoch": 0.079568, + "grad_norm": 0.78125, + "learning_rate": 9.278709677419355e-05, + "loss": 0.1796, + "step": 4973 + }, + { + "epoch": 0.079584, + "grad_norm": 0.734375, + "learning_rate": 9.278548387096775e-05, + "loss": 0.1816, + "step": 4974 + }, + { + "epoch": 0.0796, + "grad_norm": 0.625, + "learning_rate": 9.278387096774193e-05, + "loss": 0.1862, + "step": 4975 + }, + { + "epoch": 0.079616, + "grad_norm": 0.8125, + "learning_rate": 9.278225806451613e-05, + "loss": 0.2219, + "step": 4976 + }, + { + "epoch": 0.079632, + "grad_norm": 0.875, + "learning_rate": 9.278064516129032e-05, + "loss": 0.1962, + "step": 4977 + }, + { + "epoch": 0.079648, + "grad_norm": 0.83203125, + "learning_rate": 9.277903225806452e-05, + "loss": 0.1554, + "step": 4978 + }, + { + "epoch": 0.079664, + "grad_norm": 0.875, + "learning_rate": 9.277741935483872e-05, + "loss": 0.1393, + "step": 4979 + }, + { + "epoch": 0.07968, + "grad_norm": 0.74609375, + "learning_rate": 9.27758064516129e-05, + "loss": 0.1709, + "step": 4980 + }, + { + "epoch": 0.079696, + "grad_norm": 0.6796875, + "learning_rate": 9.27741935483871e-05, + "loss": 0.1778, + "step": 4981 + }, + { + "epoch": 0.079712, + "grad_norm": 0.6796875, + "learning_rate": 9.27725806451613e-05, + "loss": 0.1542, + "step": 4982 + }, + { + "epoch": 0.079728, + "grad_norm": 0.86328125, + "learning_rate": 9.277096774193549e-05, + "loss": 0.2006, + "step": 4983 + }, + { + "epoch": 0.079744, + "grad_norm": 0.76953125, + "learning_rate": 9.276935483870968e-05, + "loss": 0.1956, + "step": 4984 + }, + { + "epoch": 0.07976, + "grad_norm": 1.1015625, + "learning_rate": 9.276774193548388e-05, + "loss": 0.1905, + "step": 4985 + }, + { + "epoch": 0.079776, + "grad_norm": 1.4609375, + "learning_rate": 9.276612903225806e-05, + "loss": 0.1876, + "step": 4986 + }, + { + "epoch": 0.079792, + "grad_norm": 0.98046875, + "learning_rate": 9.276451612903226e-05, + "loss": 0.1888, + "step": 4987 + }, + { + "epoch": 0.079808, + "grad_norm": 0.7578125, + "learning_rate": 9.276290322580645e-05, + "loss": 0.1782, + "step": 4988 + }, + { + "epoch": 0.079824, + "grad_norm": 0.82421875, + "learning_rate": 9.276129032258065e-05, + "loss": 0.1868, + "step": 4989 + }, + { + "epoch": 0.07984, + "grad_norm": 0.7578125, + "learning_rate": 9.275967741935483e-05, + "loss": 0.1765, + "step": 4990 + }, + { + "epoch": 0.079856, + "grad_norm": 1.2421875, + "learning_rate": 9.275806451612903e-05, + "loss": 0.2643, + "step": 4991 + }, + { + "epoch": 0.079872, + "grad_norm": 0.98046875, + "learning_rate": 9.275645161290323e-05, + "loss": 0.1922, + "step": 4992 + }, + { + "epoch": 0.079888, + "grad_norm": 0.6875, + "learning_rate": 9.275483870967743e-05, + "loss": 0.1725, + "step": 4993 + }, + { + "epoch": 0.079904, + "grad_norm": 0.79296875, + "learning_rate": 9.275322580645162e-05, + "loss": 0.1395, + "step": 4994 + }, + { + "epoch": 0.07992, + "grad_norm": 0.9140625, + "learning_rate": 9.275161290322582e-05, + "loss": 0.1487, + "step": 4995 + }, + { + "epoch": 0.079936, + "grad_norm": 1.234375, + "learning_rate": 9.275e-05, + "loss": 0.1826, + "step": 4996 + }, + { + "epoch": 0.079952, + "grad_norm": 0.97265625, + "learning_rate": 9.27483870967742e-05, + "loss": 0.2011, + "step": 4997 + }, + { + "epoch": 0.079968, + "grad_norm": 0.9453125, + "learning_rate": 9.274677419354839e-05, + "loss": 0.1586, + "step": 4998 + }, + { + "epoch": 0.079984, + "grad_norm": 1.0390625, + "learning_rate": 9.274516129032258e-05, + "loss": 0.2126, + "step": 4999 + }, + { + "epoch": 0.08, + "grad_norm": 1.03125, + "learning_rate": 9.274354838709678e-05, + "loss": 0.2455, + "step": 5000 + }, + { + "epoch": 0.080016, + "grad_norm": 0.87890625, + "learning_rate": 9.274193548387096e-05, + "loss": 0.1953, + "step": 5001 + }, + { + "epoch": 0.080032, + "grad_norm": 0.5078125, + "learning_rate": 9.274032258064516e-05, + "loss": 0.1513, + "step": 5002 + }, + { + "epoch": 0.080048, + "grad_norm": 0.8515625, + "learning_rate": 9.273870967741936e-05, + "loss": 0.2245, + "step": 5003 + }, + { + "epoch": 0.080064, + "grad_norm": 0.875, + "learning_rate": 9.273709677419356e-05, + "loss": 0.2207, + "step": 5004 + }, + { + "epoch": 0.08008, + "grad_norm": 0.98828125, + "learning_rate": 9.273548387096775e-05, + "loss": 0.2219, + "step": 5005 + }, + { + "epoch": 0.080096, + "grad_norm": 0.44921875, + "learning_rate": 9.273387096774195e-05, + "loss": 0.1571, + "step": 5006 + }, + { + "epoch": 0.080112, + "grad_norm": 0.8515625, + "learning_rate": 9.273225806451613e-05, + "loss": 0.232, + "step": 5007 + }, + { + "epoch": 0.080128, + "grad_norm": 0.6953125, + "learning_rate": 9.273064516129033e-05, + "loss": 0.1968, + "step": 5008 + }, + { + "epoch": 0.080144, + "grad_norm": 1.28125, + "learning_rate": 9.272903225806452e-05, + "loss": 0.1817, + "step": 5009 + }, + { + "epoch": 0.08016, + "grad_norm": 1.3984375, + "learning_rate": 9.272741935483872e-05, + "loss": 0.1921, + "step": 5010 + }, + { + "epoch": 0.080176, + "grad_norm": 0.67578125, + "learning_rate": 9.27258064516129e-05, + "loss": 0.1716, + "step": 5011 + }, + { + "epoch": 0.080192, + "grad_norm": 0.6640625, + "learning_rate": 9.27241935483871e-05, + "loss": 0.1993, + "step": 5012 + }, + { + "epoch": 0.080208, + "grad_norm": 0.58203125, + "learning_rate": 9.272258064516129e-05, + "loss": 0.1722, + "step": 5013 + }, + { + "epoch": 0.080224, + "grad_norm": 1.203125, + "learning_rate": 9.272096774193549e-05, + "loss": 0.1871, + "step": 5014 + }, + { + "epoch": 0.08024, + "grad_norm": 1.2109375, + "learning_rate": 9.271935483870968e-05, + "loss": 0.1637, + "step": 5015 + }, + { + "epoch": 0.080256, + "grad_norm": 1.0390625, + "learning_rate": 9.271774193548388e-05, + "loss": 0.185, + "step": 5016 + }, + { + "epoch": 0.080272, + "grad_norm": 1.546875, + "learning_rate": 9.271612903225808e-05, + "loss": 0.1778, + "step": 5017 + }, + { + "epoch": 0.080288, + "grad_norm": 0.9140625, + "learning_rate": 9.271451612903226e-05, + "loss": 0.1649, + "step": 5018 + }, + { + "epoch": 0.080304, + "grad_norm": 1.3828125, + "learning_rate": 9.271290322580646e-05, + "loss": 0.2247, + "step": 5019 + }, + { + "epoch": 0.08032, + "grad_norm": 0.78125, + "learning_rate": 9.271129032258065e-05, + "loss": 0.1288, + "step": 5020 + }, + { + "epoch": 0.080336, + "grad_norm": 1.3671875, + "learning_rate": 9.270967741935485e-05, + "loss": 0.1733, + "step": 5021 + }, + { + "epoch": 0.080352, + "grad_norm": 1.9375, + "learning_rate": 9.270806451612903e-05, + "loss": 0.1909, + "step": 5022 + }, + { + "epoch": 0.080368, + "grad_norm": 1.0546875, + "learning_rate": 9.270645161290323e-05, + "loss": 0.1649, + "step": 5023 + }, + { + "epoch": 0.080384, + "grad_norm": 1.3046875, + "learning_rate": 9.270483870967742e-05, + "loss": 0.1947, + "step": 5024 + }, + { + "epoch": 0.0804, + "grad_norm": 1.7421875, + "learning_rate": 9.270322580645162e-05, + "loss": 0.1861, + "step": 5025 + }, + { + "epoch": 0.080416, + "grad_norm": 0.97265625, + "learning_rate": 9.27016129032258e-05, + "loss": 0.1759, + "step": 5026 + }, + { + "epoch": 0.080432, + "grad_norm": 1.578125, + "learning_rate": 9.27e-05, + "loss": 0.196, + "step": 5027 + }, + { + "epoch": 0.080448, + "grad_norm": 2.1875, + "learning_rate": 9.26983870967742e-05, + "loss": 0.2159, + "step": 5028 + }, + { + "epoch": 0.080464, + "grad_norm": 1.1171875, + "learning_rate": 9.26967741935484e-05, + "loss": 0.2078, + "step": 5029 + }, + { + "epoch": 0.08048, + "grad_norm": 1.078125, + "learning_rate": 9.269516129032259e-05, + "loss": 0.1348, + "step": 5030 + }, + { + "epoch": 0.080496, + "grad_norm": 0.86328125, + "learning_rate": 9.269354838709677e-05, + "loss": 0.1094, + "step": 5031 + }, + { + "epoch": 0.080512, + "grad_norm": 0.90625, + "learning_rate": 9.269193548387097e-05, + "loss": 0.1803, + "step": 5032 + }, + { + "epoch": 0.080528, + "grad_norm": 0.9296875, + "learning_rate": 9.269032258064516e-05, + "loss": 0.143, + "step": 5033 + }, + { + "epoch": 0.080544, + "grad_norm": 1.0625, + "learning_rate": 9.268870967741936e-05, + "loss": 0.1964, + "step": 5034 + }, + { + "epoch": 0.08056, + "grad_norm": 0.84375, + "learning_rate": 9.268709677419355e-05, + "loss": 0.1955, + "step": 5035 + }, + { + "epoch": 0.080576, + "grad_norm": 0.94140625, + "learning_rate": 9.268548387096775e-05, + "loss": 0.1497, + "step": 5036 + }, + { + "epoch": 0.080592, + "grad_norm": 0.921875, + "learning_rate": 9.268387096774193e-05, + "loss": 0.1757, + "step": 5037 + }, + { + "epoch": 0.080608, + "grad_norm": 1.359375, + "learning_rate": 9.268225806451613e-05, + "loss": 0.1689, + "step": 5038 + }, + { + "epoch": 0.080624, + "grad_norm": 1.1015625, + "learning_rate": 9.268064516129033e-05, + "loss": 0.1802, + "step": 5039 + }, + { + "epoch": 0.08064, + "grad_norm": 0.69140625, + "learning_rate": 9.267903225806453e-05, + "loss": 0.1769, + "step": 5040 + }, + { + "epoch": 0.080656, + "grad_norm": 0.81640625, + "learning_rate": 9.267741935483872e-05, + "loss": 0.1075, + "step": 5041 + }, + { + "epoch": 0.080672, + "grad_norm": 1.0234375, + "learning_rate": 9.267580645161292e-05, + "loss": 0.1791, + "step": 5042 + }, + { + "epoch": 0.080688, + "grad_norm": 1.0, + "learning_rate": 9.26741935483871e-05, + "loss": 0.1831, + "step": 5043 + }, + { + "epoch": 0.080704, + "grad_norm": 0.69140625, + "learning_rate": 9.26725806451613e-05, + "loss": 0.1584, + "step": 5044 + }, + { + "epoch": 0.08072, + "grad_norm": 0.6328125, + "learning_rate": 9.267096774193549e-05, + "loss": 0.1762, + "step": 5045 + }, + { + "epoch": 0.080736, + "grad_norm": 1.203125, + "learning_rate": 9.266935483870967e-05, + "loss": 0.1859, + "step": 5046 + }, + { + "epoch": 0.080752, + "grad_norm": 0.796875, + "learning_rate": 9.266774193548387e-05, + "loss": 0.2117, + "step": 5047 + }, + { + "epoch": 0.080768, + "grad_norm": 0.7890625, + "learning_rate": 9.266612903225806e-05, + "loss": 0.2253, + "step": 5048 + }, + { + "epoch": 0.080784, + "grad_norm": 1.3203125, + "learning_rate": 9.266451612903226e-05, + "loss": 0.2188, + "step": 5049 + }, + { + "epoch": 0.0808, + "grad_norm": 0.5625, + "learning_rate": 9.266290322580645e-05, + "loss": 0.1636, + "step": 5050 + }, + { + "epoch": 0.080816, + "grad_norm": 1.1640625, + "learning_rate": 9.266129032258065e-05, + "loss": 0.1901, + "step": 5051 + }, + { + "epoch": 0.080832, + "grad_norm": 0.69921875, + "learning_rate": 9.265967741935485e-05, + "loss": 0.2141, + "step": 5052 + }, + { + "epoch": 0.080848, + "grad_norm": 1.734375, + "learning_rate": 9.265806451612904e-05, + "loss": 0.2264, + "step": 5053 + }, + { + "epoch": 0.080864, + "grad_norm": 0.80078125, + "learning_rate": 9.265645161290323e-05, + "loss": 0.1748, + "step": 5054 + }, + { + "epoch": 0.08088, + "grad_norm": 0.93359375, + "learning_rate": 9.265483870967743e-05, + "loss": 0.1571, + "step": 5055 + }, + { + "epoch": 0.080896, + "grad_norm": 1.0625, + "learning_rate": 9.265322580645162e-05, + "loss": 0.1544, + "step": 5056 + }, + { + "epoch": 0.080912, + "grad_norm": 0.734375, + "learning_rate": 9.265161290322582e-05, + "loss": 0.1942, + "step": 5057 + }, + { + "epoch": 0.080928, + "grad_norm": 0.69140625, + "learning_rate": 9.265e-05, + "loss": 0.1919, + "step": 5058 + }, + { + "epoch": 0.080944, + "grad_norm": 0.78515625, + "learning_rate": 9.26483870967742e-05, + "loss": 0.1612, + "step": 5059 + }, + { + "epoch": 0.08096, + "grad_norm": 1.15625, + "learning_rate": 9.264677419354839e-05, + "loss": 0.1655, + "step": 5060 + }, + { + "epoch": 0.080976, + "grad_norm": 1.140625, + "learning_rate": 9.264516129032257e-05, + "loss": 0.1944, + "step": 5061 + }, + { + "epoch": 0.080992, + "grad_norm": 0.69140625, + "learning_rate": 9.264354838709677e-05, + "loss": 0.2176, + "step": 5062 + }, + { + "epoch": 0.081008, + "grad_norm": 1.2421875, + "learning_rate": 9.264193548387097e-05, + "loss": 0.2036, + "step": 5063 + }, + { + "epoch": 0.081024, + "grad_norm": 0.90234375, + "learning_rate": 9.264032258064517e-05, + "loss": 0.2047, + "step": 5064 + }, + { + "epoch": 0.08104, + "grad_norm": 0.7265625, + "learning_rate": 9.263870967741936e-05, + "loss": 0.1847, + "step": 5065 + }, + { + "epoch": 0.081056, + "grad_norm": 0.64453125, + "learning_rate": 9.263709677419356e-05, + "loss": 0.1594, + "step": 5066 + }, + { + "epoch": 0.081072, + "grad_norm": 0.81640625, + "learning_rate": 9.263548387096774e-05, + "loss": 0.1714, + "step": 5067 + }, + { + "epoch": 0.081088, + "grad_norm": 0.65625, + "learning_rate": 9.263387096774194e-05, + "loss": 0.1673, + "step": 5068 + }, + { + "epoch": 0.081104, + "grad_norm": 0.96875, + "learning_rate": 9.263225806451613e-05, + "loss": 0.1284, + "step": 5069 + }, + { + "epoch": 0.08112, + "grad_norm": 1.171875, + "learning_rate": 9.263064516129033e-05, + "loss": 0.2079, + "step": 5070 + }, + { + "epoch": 0.081136, + "grad_norm": 0.7890625, + "learning_rate": 9.262903225806452e-05, + "loss": 0.1759, + "step": 5071 + }, + { + "epoch": 0.081152, + "grad_norm": 0.73046875, + "learning_rate": 9.262741935483872e-05, + "loss": 0.1749, + "step": 5072 + }, + { + "epoch": 0.081168, + "grad_norm": 0.56640625, + "learning_rate": 9.26258064516129e-05, + "loss": 0.1716, + "step": 5073 + }, + { + "epoch": 0.081184, + "grad_norm": 0.63671875, + "learning_rate": 9.26241935483871e-05, + "loss": 0.1935, + "step": 5074 + }, + { + "epoch": 0.0812, + "grad_norm": 0.734375, + "learning_rate": 9.262258064516129e-05, + "loss": 0.154, + "step": 5075 + }, + { + "epoch": 0.081216, + "grad_norm": 1.1484375, + "learning_rate": 9.262096774193549e-05, + "loss": 0.2094, + "step": 5076 + }, + { + "epoch": 0.081232, + "grad_norm": 1.109375, + "learning_rate": 9.261935483870969e-05, + "loss": 0.1728, + "step": 5077 + }, + { + "epoch": 0.081248, + "grad_norm": 0.81640625, + "learning_rate": 9.261774193548387e-05, + "loss": 0.1869, + "step": 5078 + }, + { + "epoch": 0.081264, + "grad_norm": 0.5859375, + "learning_rate": 9.261612903225807e-05, + "loss": 0.1522, + "step": 5079 + }, + { + "epoch": 0.08128, + "grad_norm": 0.93359375, + "learning_rate": 9.261451612903226e-05, + "loss": 0.1775, + "step": 5080 + }, + { + "epoch": 0.081296, + "grad_norm": 1.0, + "learning_rate": 9.261290322580646e-05, + "loss": 0.1954, + "step": 5081 + }, + { + "epoch": 0.081312, + "grad_norm": 0.8515625, + "learning_rate": 9.261129032258064e-05, + "loss": 0.1502, + "step": 5082 + }, + { + "epoch": 0.081328, + "grad_norm": 0.7265625, + "learning_rate": 9.260967741935484e-05, + "loss": 0.2043, + "step": 5083 + }, + { + "epoch": 0.081344, + "grad_norm": 0.734375, + "learning_rate": 9.260806451612903e-05, + "loss": 0.1633, + "step": 5084 + }, + { + "epoch": 0.08136, + "grad_norm": 0.67578125, + "learning_rate": 9.260645161290323e-05, + "loss": 0.1429, + "step": 5085 + }, + { + "epoch": 0.081376, + "grad_norm": 0.67578125, + "learning_rate": 9.260483870967742e-05, + "loss": 0.1643, + "step": 5086 + }, + { + "epoch": 0.081392, + "grad_norm": 1.0390625, + "learning_rate": 9.260322580645162e-05, + "loss": 0.1873, + "step": 5087 + }, + { + "epoch": 0.081408, + "grad_norm": 0.6015625, + "learning_rate": 9.260161290322582e-05, + "loss": 0.1639, + "step": 5088 + }, + { + "epoch": 0.081424, + "grad_norm": 0.70703125, + "learning_rate": 9.260000000000001e-05, + "loss": 0.1627, + "step": 5089 + }, + { + "epoch": 0.08144, + "grad_norm": 0.66796875, + "learning_rate": 9.25983870967742e-05, + "loss": 0.1887, + "step": 5090 + }, + { + "epoch": 0.081456, + "grad_norm": 0.52734375, + "learning_rate": 9.25967741935484e-05, + "loss": 0.169, + "step": 5091 + }, + { + "epoch": 0.081472, + "grad_norm": 1.046875, + "learning_rate": 9.259516129032259e-05, + "loss": 0.1732, + "step": 5092 + }, + { + "epoch": 0.081488, + "grad_norm": 0.64453125, + "learning_rate": 9.259354838709677e-05, + "loss": 0.1628, + "step": 5093 + }, + { + "epoch": 0.081504, + "grad_norm": 0.8984375, + "learning_rate": 9.259193548387097e-05, + "loss": 0.2083, + "step": 5094 + }, + { + "epoch": 0.08152, + "grad_norm": 0.5546875, + "learning_rate": 9.259032258064516e-05, + "loss": 0.1685, + "step": 5095 + }, + { + "epoch": 0.081536, + "grad_norm": 1.0078125, + "learning_rate": 9.258870967741936e-05, + "loss": 0.1314, + "step": 5096 + }, + { + "epoch": 0.081552, + "grad_norm": 0.74609375, + "learning_rate": 9.258709677419354e-05, + "loss": 0.1606, + "step": 5097 + }, + { + "epoch": 0.081568, + "grad_norm": 1.0546875, + "learning_rate": 9.258548387096774e-05, + "loss": 0.1551, + "step": 5098 + }, + { + "epoch": 0.081584, + "grad_norm": 0.796875, + "learning_rate": 9.258387096774194e-05, + "loss": 0.1893, + "step": 5099 + }, + { + "epoch": 0.0816, + "grad_norm": 1.28125, + "learning_rate": 9.258225806451614e-05, + "loss": 0.1856, + "step": 5100 + }, + { + "epoch": 0.081616, + "grad_norm": 1.421875, + "learning_rate": 9.258064516129033e-05, + "loss": 0.1644, + "step": 5101 + }, + { + "epoch": 0.081632, + "grad_norm": 0.83984375, + "learning_rate": 9.257903225806453e-05, + "loss": 0.1455, + "step": 5102 + }, + { + "epoch": 0.081648, + "grad_norm": 0.9140625, + "learning_rate": 9.257741935483871e-05, + "loss": 0.2002, + "step": 5103 + }, + { + "epoch": 0.081664, + "grad_norm": 0.58203125, + "learning_rate": 9.257580645161291e-05, + "loss": 0.1433, + "step": 5104 + }, + { + "epoch": 0.08168, + "grad_norm": 0.56640625, + "learning_rate": 9.25741935483871e-05, + "loss": 0.1713, + "step": 5105 + }, + { + "epoch": 0.081696, + "grad_norm": 0.86328125, + "learning_rate": 9.25725806451613e-05, + "loss": 0.1758, + "step": 5106 + }, + { + "epoch": 0.081712, + "grad_norm": 0.70703125, + "learning_rate": 9.257096774193549e-05, + "loss": 0.1704, + "step": 5107 + }, + { + "epoch": 0.081728, + "grad_norm": 1.4453125, + "learning_rate": 9.256935483870967e-05, + "loss": 0.2269, + "step": 5108 + }, + { + "epoch": 0.081744, + "grad_norm": 0.85546875, + "learning_rate": 9.256774193548387e-05, + "loss": 0.1791, + "step": 5109 + }, + { + "epoch": 0.08176, + "grad_norm": 0.8828125, + "learning_rate": 9.256612903225806e-05, + "loss": 0.1637, + "step": 5110 + }, + { + "epoch": 0.081776, + "grad_norm": 0.8359375, + "learning_rate": 9.256451612903226e-05, + "loss": 0.2018, + "step": 5111 + }, + { + "epoch": 0.081792, + "grad_norm": 1.125, + "learning_rate": 9.256290322580646e-05, + "loss": 0.1954, + "step": 5112 + }, + { + "epoch": 0.081808, + "grad_norm": 0.98046875, + "learning_rate": 9.256129032258066e-05, + "loss": 0.1583, + "step": 5113 + }, + { + "epoch": 0.081824, + "grad_norm": 0.828125, + "learning_rate": 9.255967741935484e-05, + "loss": 0.1585, + "step": 5114 + }, + { + "epoch": 0.08184, + "grad_norm": 1.0546875, + "learning_rate": 9.255806451612904e-05, + "loss": 0.2226, + "step": 5115 + }, + { + "epoch": 0.081856, + "grad_norm": 0.75390625, + "learning_rate": 9.255645161290323e-05, + "loss": 0.1869, + "step": 5116 + }, + { + "epoch": 0.081872, + "grad_norm": 0.9140625, + "learning_rate": 9.255483870967743e-05, + "loss": 0.1717, + "step": 5117 + }, + { + "epoch": 0.081888, + "grad_norm": 0.67578125, + "learning_rate": 9.255322580645161e-05, + "loss": 0.1446, + "step": 5118 + }, + { + "epoch": 0.081904, + "grad_norm": 1.0390625, + "learning_rate": 9.255161290322581e-05, + "loss": 0.2291, + "step": 5119 + }, + { + "epoch": 0.08192, + "grad_norm": 0.65625, + "learning_rate": 9.255e-05, + "loss": 0.156, + "step": 5120 + }, + { + "epoch": 0.081936, + "grad_norm": 0.9375, + "learning_rate": 9.25483870967742e-05, + "loss": 0.1885, + "step": 5121 + }, + { + "epoch": 0.081952, + "grad_norm": 1.0234375, + "learning_rate": 9.254677419354839e-05, + "loss": 0.1857, + "step": 5122 + }, + { + "epoch": 0.081968, + "grad_norm": 0.89453125, + "learning_rate": 9.254516129032259e-05, + "loss": 0.1997, + "step": 5123 + }, + { + "epoch": 0.081984, + "grad_norm": 0.55078125, + "learning_rate": 9.254354838709678e-05, + "loss": 0.1757, + "step": 5124 + }, + { + "epoch": 0.082, + "grad_norm": 0.8046875, + "learning_rate": 9.254193548387097e-05, + "loss": 0.147, + "step": 5125 + }, + { + "epoch": 0.082016, + "grad_norm": 0.57421875, + "learning_rate": 9.254032258064517e-05, + "loss": 0.1713, + "step": 5126 + }, + { + "epoch": 0.082032, + "grad_norm": 1.359375, + "learning_rate": 9.253870967741936e-05, + "loss": 0.174, + "step": 5127 + }, + { + "epoch": 0.082048, + "grad_norm": 1.0234375, + "learning_rate": 9.253709677419356e-05, + "loss": 0.1579, + "step": 5128 + }, + { + "epoch": 0.082064, + "grad_norm": 0.75, + "learning_rate": 9.253548387096774e-05, + "loss": 0.1692, + "step": 5129 + }, + { + "epoch": 0.08208, + "grad_norm": 1.125, + "learning_rate": 9.253387096774194e-05, + "loss": 0.1928, + "step": 5130 + }, + { + "epoch": 0.082096, + "grad_norm": 1.4921875, + "learning_rate": 9.253225806451613e-05, + "loss": 0.2348, + "step": 5131 + }, + { + "epoch": 0.082112, + "grad_norm": 0.62890625, + "learning_rate": 9.253064516129033e-05, + "loss": 0.1402, + "step": 5132 + }, + { + "epoch": 0.082128, + "grad_norm": 0.6015625, + "learning_rate": 9.252903225806451e-05, + "loss": 0.1456, + "step": 5133 + }, + { + "epoch": 0.082144, + "grad_norm": 0.984375, + "learning_rate": 9.252741935483871e-05, + "loss": 0.1737, + "step": 5134 + }, + { + "epoch": 0.08216, + "grad_norm": 1.1796875, + "learning_rate": 9.252580645161291e-05, + "loss": 0.1879, + "step": 5135 + }, + { + "epoch": 0.082176, + "grad_norm": 0.87109375, + "learning_rate": 9.25241935483871e-05, + "loss": 0.2016, + "step": 5136 + }, + { + "epoch": 0.082192, + "grad_norm": 0.78515625, + "learning_rate": 9.25225806451613e-05, + "loss": 0.2086, + "step": 5137 + }, + { + "epoch": 0.082208, + "grad_norm": 1.1484375, + "learning_rate": 9.25209677419355e-05, + "loss": 0.1683, + "step": 5138 + }, + { + "epoch": 0.082224, + "grad_norm": 0.8125, + "learning_rate": 9.251935483870968e-05, + "loss": 0.144, + "step": 5139 + }, + { + "epoch": 0.08224, + "grad_norm": 1.0625, + "learning_rate": 9.251774193548387e-05, + "loss": 0.2025, + "step": 5140 + }, + { + "epoch": 0.082256, + "grad_norm": 0.6953125, + "learning_rate": 9.251612903225807e-05, + "loss": 0.17, + "step": 5141 + }, + { + "epoch": 0.082272, + "grad_norm": 1.2421875, + "learning_rate": 9.251451612903226e-05, + "loss": 0.1548, + "step": 5142 + }, + { + "epoch": 0.082288, + "grad_norm": 0.96484375, + "learning_rate": 9.251290322580646e-05, + "loss": 0.189, + "step": 5143 + }, + { + "epoch": 0.082304, + "grad_norm": 0.5859375, + "learning_rate": 9.251129032258064e-05, + "loss": 0.1566, + "step": 5144 + }, + { + "epoch": 0.08232, + "grad_norm": 0.7578125, + "learning_rate": 9.250967741935484e-05, + "loss": 0.2086, + "step": 5145 + }, + { + "epoch": 0.082336, + "grad_norm": 1.1015625, + "learning_rate": 9.250806451612903e-05, + "loss": 0.1846, + "step": 5146 + }, + { + "epoch": 0.082352, + "grad_norm": 0.69140625, + "learning_rate": 9.250645161290323e-05, + "loss": 0.183, + "step": 5147 + }, + { + "epoch": 0.082368, + "grad_norm": 0.64453125, + "learning_rate": 9.250483870967743e-05, + "loss": 0.1621, + "step": 5148 + }, + { + "epoch": 0.082384, + "grad_norm": 0.99609375, + "learning_rate": 9.250322580645163e-05, + "loss": 0.205, + "step": 5149 + }, + { + "epoch": 0.0824, + "grad_norm": 0.80859375, + "learning_rate": 9.250161290322581e-05, + "loss": 0.1818, + "step": 5150 + }, + { + "epoch": 0.082416, + "grad_norm": 1.2734375, + "learning_rate": 9.250000000000001e-05, + "loss": 0.1515, + "step": 5151 + }, + { + "epoch": 0.082432, + "grad_norm": 0.6015625, + "learning_rate": 9.24983870967742e-05, + "loss": 0.1907, + "step": 5152 + }, + { + "epoch": 0.082448, + "grad_norm": 0.640625, + "learning_rate": 9.24967741935484e-05, + "loss": 0.1606, + "step": 5153 + }, + { + "epoch": 0.082464, + "grad_norm": 0.76171875, + "learning_rate": 9.249516129032258e-05, + "loss": 0.1464, + "step": 5154 + }, + { + "epoch": 0.08248, + "grad_norm": 0.89453125, + "learning_rate": 9.249354838709677e-05, + "loss": 0.1598, + "step": 5155 + }, + { + "epoch": 0.082496, + "grad_norm": 0.8203125, + "learning_rate": 9.249193548387097e-05, + "loss": 0.1703, + "step": 5156 + }, + { + "epoch": 0.082512, + "grad_norm": 0.98046875, + "learning_rate": 9.249032258064516e-05, + "loss": 0.2034, + "step": 5157 + }, + { + "epoch": 0.082528, + "grad_norm": 0.73828125, + "learning_rate": 9.248870967741936e-05, + "loss": 0.1784, + "step": 5158 + }, + { + "epoch": 0.082544, + "grad_norm": 0.7578125, + "learning_rate": 9.248709677419356e-05, + "loss": 0.1623, + "step": 5159 + }, + { + "epoch": 0.08256, + "grad_norm": 1.265625, + "learning_rate": 9.248548387096775e-05, + "loss": 0.2208, + "step": 5160 + }, + { + "epoch": 0.082576, + "grad_norm": 0.93359375, + "learning_rate": 9.248387096774194e-05, + "loss": 0.137, + "step": 5161 + }, + { + "epoch": 0.082592, + "grad_norm": 0.96875, + "learning_rate": 9.248225806451614e-05, + "loss": 0.1999, + "step": 5162 + }, + { + "epoch": 0.082608, + "grad_norm": 0.796875, + "learning_rate": 9.248064516129033e-05, + "loss": 0.1482, + "step": 5163 + }, + { + "epoch": 0.082624, + "grad_norm": 1.3515625, + "learning_rate": 9.247903225806453e-05, + "loss": 0.1511, + "step": 5164 + }, + { + "epoch": 0.08264, + "grad_norm": 0.8828125, + "learning_rate": 9.247741935483871e-05, + "loss": 0.1774, + "step": 5165 + }, + { + "epoch": 0.082656, + "grad_norm": 0.87890625, + "learning_rate": 9.247580645161291e-05, + "loss": 0.1954, + "step": 5166 + }, + { + "epoch": 0.082672, + "grad_norm": 1.2109375, + "learning_rate": 9.24741935483871e-05, + "loss": 0.2368, + "step": 5167 + }, + { + "epoch": 0.082688, + "grad_norm": 1.1484375, + "learning_rate": 9.24725806451613e-05, + "loss": 0.241, + "step": 5168 + }, + { + "epoch": 0.082704, + "grad_norm": 0.8515625, + "learning_rate": 9.247096774193548e-05, + "loss": 0.1566, + "step": 5169 + }, + { + "epoch": 0.08272, + "grad_norm": 0.65234375, + "learning_rate": 9.246935483870967e-05, + "loss": 0.1735, + "step": 5170 + }, + { + "epoch": 0.082736, + "grad_norm": 0.9921875, + "learning_rate": 9.246774193548387e-05, + "loss": 0.1879, + "step": 5171 + }, + { + "epoch": 0.082752, + "grad_norm": 0.51171875, + "learning_rate": 9.246612903225807e-05, + "loss": 0.1613, + "step": 5172 + }, + { + "epoch": 0.082768, + "grad_norm": 0.91015625, + "learning_rate": 9.246451612903227e-05, + "loss": 0.2303, + "step": 5173 + }, + { + "epoch": 0.082784, + "grad_norm": 0.8671875, + "learning_rate": 9.246290322580645e-05, + "loss": 0.1718, + "step": 5174 + }, + { + "epoch": 0.0828, + "grad_norm": 0.66015625, + "learning_rate": 9.246129032258065e-05, + "loss": 0.1904, + "step": 5175 + }, + { + "epoch": 0.082816, + "grad_norm": 0.68359375, + "learning_rate": 9.245967741935484e-05, + "loss": 0.1393, + "step": 5176 + }, + { + "epoch": 0.082832, + "grad_norm": 0.55078125, + "learning_rate": 9.245806451612904e-05, + "loss": 0.1485, + "step": 5177 + }, + { + "epoch": 0.082848, + "grad_norm": 0.70703125, + "learning_rate": 9.245645161290323e-05, + "loss": 0.2354, + "step": 5178 + }, + { + "epoch": 0.082864, + "grad_norm": 0.98828125, + "learning_rate": 9.245483870967743e-05, + "loss": 0.16, + "step": 5179 + }, + { + "epoch": 0.08288, + "grad_norm": 0.62890625, + "learning_rate": 9.245322580645161e-05, + "loss": 0.1574, + "step": 5180 + }, + { + "epoch": 0.082896, + "grad_norm": 0.6328125, + "learning_rate": 9.245161290322581e-05, + "loss": 0.1637, + "step": 5181 + }, + { + "epoch": 0.082912, + "grad_norm": 0.63671875, + "learning_rate": 9.245e-05, + "loss": 0.2131, + "step": 5182 + }, + { + "epoch": 0.082928, + "grad_norm": 0.8046875, + "learning_rate": 9.24483870967742e-05, + "loss": 0.143, + "step": 5183 + }, + { + "epoch": 0.082944, + "grad_norm": 1.203125, + "learning_rate": 9.24467741935484e-05, + "loss": 0.2292, + "step": 5184 + }, + { + "epoch": 0.08296, + "grad_norm": 0.6953125, + "learning_rate": 9.24451612903226e-05, + "loss": 0.1658, + "step": 5185 + }, + { + "epoch": 0.082976, + "grad_norm": 1.046875, + "learning_rate": 9.244354838709678e-05, + "loss": 0.1727, + "step": 5186 + }, + { + "epoch": 0.082992, + "grad_norm": 0.8046875, + "learning_rate": 9.244193548387097e-05, + "loss": 0.1554, + "step": 5187 + }, + { + "epoch": 0.083008, + "grad_norm": 0.81640625, + "learning_rate": 9.244032258064517e-05, + "loss": 0.1887, + "step": 5188 + }, + { + "epoch": 0.083024, + "grad_norm": 0.73828125, + "learning_rate": 9.243870967741935e-05, + "loss": 0.1615, + "step": 5189 + }, + { + "epoch": 0.08304, + "grad_norm": 1.3515625, + "learning_rate": 9.243709677419355e-05, + "loss": 0.1925, + "step": 5190 + }, + { + "epoch": 0.083056, + "grad_norm": 0.96875, + "learning_rate": 9.243548387096774e-05, + "loss": 0.2101, + "step": 5191 + }, + { + "epoch": 0.083072, + "grad_norm": 0.81640625, + "learning_rate": 9.243387096774194e-05, + "loss": 0.1823, + "step": 5192 + }, + { + "epoch": 0.083088, + "grad_norm": 0.98828125, + "learning_rate": 9.243225806451613e-05, + "loss": 0.1825, + "step": 5193 + }, + { + "epoch": 0.083104, + "grad_norm": 1.0234375, + "learning_rate": 9.243064516129033e-05, + "loss": 0.1625, + "step": 5194 + }, + { + "epoch": 0.08312, + "grad_norm": 0.498046875, + "learning_rate": 9.242903225806452e-05, + "loss": 0.1457, + "step": 5195 + }, + { + "epoch": 0.083136, + "grad_norm": 0.828125, + "learning_rate": 9.242741935483872e-05, + "loss": 0.1514, + "step": 5196 + }, + { + "epoch": 0.083152, + "grad_norm": 0.921875, + "learning_rate": 9.242580645161291e-05, + "loss": 0.1981, + "step": 5197 + }, + { + "epoch": 0.083168, + "grad_norm": 0.9921875, + "learning_rate": 9.242419354838711e-05, + "loss": 0.1832, + "step": 5198 + }, + { + "epoch": 0.083184, + "grad_norm": 1.2265625, + "learning_rate": 9.24225806451613e-05, + "loss": 0.1969, + "step": 5199 + }, + { + "epoch": 0.0832, + "grad_norm": 0.71875, + "learning_rate": 9.24209677419355e-05, + "loss": 0.1681, + "step": 5200 + }, + { + "epoch": 0.083216, + "grad_norm": 1.21875, + "learning_rate": 9.241935483870968e-05, + "loss": 0.1819, + "step": 5201 + }, + { + "epoch": 0.083232, + "grad_norm": 1.1953125, + "learning_rate": 9.241774193548387e-05, + "loss": 0.196, + "step": 5202 + }, + { + "epoch": 0.083248, + "grad_norm": 0.9765625, + "learning_rate": 9.241612903225807e-05, + "loss": 0.1875, + "step": 5203 + }, + { + "epoch": 0.083264, + "grad_norm": 0.76953125, + "learning_rate": 9.241451612903225e-05, + "loss": 0.1317, + "step": 5204 + }, + { + "epoch": 0.08328, + "grad_norm": 0.57421875, + "learning_rate": 9.241290322580645e-05, + "loss": 0.1764, + "step": 5205 + }, + { + "epoch": 0.083296, + "grad_norm": 0.94921875, + "learning_rate": 9.241129032258064e-05, + "loss": 0.1736, + "step": 5206 + }, + { + "epoch": 0.083312, + "grad_norm": 1.265625, + "learning_rate": 9.240967741935484e-05, + "loss": 0.2217, + "step": 5207 + }, + { + "epoch": 0.083328, + "grad_norm": 0.7265625, + "learning_rate": 9.240806451612904e-05, + "loss": 0.1705, + "step": 5208 + }, + { + "epoch": 0.083344, + "grad_norm": 0.97265625, + "learning_rate": 9.240645161290324e-05, + "loss": 0.1325, + "step": 5209 + }, + { + "epoch": 0.08336, + "grad_norm": 0.75390625, + "learning_rate": 9.240483870967742e-05, + "loss": 0.2074, + "step": 5210 + }, + { + "epoch": 0.083376, + "grad_norm": 0.71484375, + "learning_rate": 9.240322580645162e-05, + "loss": 0.186, + "step": 5211 + }, + { + "epoch": 0.083392, + "grad_norm": 0.7734375, + "learning_rate": 9.240161290322581e-05, + "loss": 0.1631, + "step": 5212 + }, + { + "epoch": 0.083408, + "grad_norm": 2.078125, + "learning_rate": 9.240000000000001e-05, + "loss": 0.2267, + "step": 5213 + }, + { + "epoch": 0.083424, + "grad_norm": 1.1484375, + "learning_rate": 9.23983870967742e-05, + "loss": 0.1954, + "step": 5214 + }, + { + "epoch": 0.08344, + "grad_norm": 0.73828125, + "learning_rate": 9.23967741935484e-05, + "loss": 0.223, + "step": 5215 + }, + { + "epoch": 0.083456, + "grad_norm": 1.015625, + "learning_rate": 9.239516129032258e-05, + "loss": 0.2042, + "step": 5216 + }, + { + "epoch": 0.083472, + "grad_norm": 1.6015625, + "learning_rate": 9.239354838709677e-05, + "loss": 0.2199, + "step": 5217 + }, + { + "epoch": 0.083488, + "grad_norm": 0.8515625, + "learning_rate": 9.239193548387097e-05, + "loss": 0.1632, + "step": 5218 + }, + { + "epoch": 0.083504, + "grad_norm": 0.51953125, + "learning_rate": 9.239032258064517e-05, + "loss": 0.1578, + "step": 5219 + }, + { + "epoch": 0.08352, + "grad_norm": 0.921875, + "learning_rate": 9.238870967741937e-05, + "loss": 0.1931, + "step": 5220 + }, + { + "epoch": 0.083536, + "grad_norm": 0.73828125, + "learning_rate": 9.238709677419355e-05, + "loss": 0.1557, + "step": 5221 + }, + { + "epoch": 0.083552, + "grad_norm": 0.98828125, + "learning_rate": 9.238548387096775e-05, + "loss": 0.2017, + "step": 5222 + }, + { + "epoch": 0.083568, + "grad_norm": 0.78515625, + "learning_rate": 9.238387096774194e-05, + "loss": 0.1503, + "step": 5223 + }, + { + "epoch": 0.083584, + "grad_norm": 1.328125, + "learning_rate": 9.238225806451614e-05, + "loss": 0.18, + "step": 5224 + }, + { + "epoch": 0.0836, + "grad_norm": 1.5625, + "learning_rate": 9.238064516129032e-05, + "loss": 0.1843, + "step": 5225 + }, + { + "epoch": 0.083616, + "grad_norm": 0.609375, + "learning_rate": 9.237903225806452e-05, + "loss": 0.1635, + "step": 5226 + }, + { + "epoch": 0.083632, + "grad_norm": 0.8984375, + "learning_rate": 9.237741935483871e-05, + "loss": 0.2143, + "step": 5227 + }, + { + "epoch": 0.083648, + "grad_norm": 0.76953125, + "learning_rate": 9.237580645161291e-05, + "loss": 0.232, + "step": 5228 + }, + { + "epoch": 0.083664, + "grad_norm": 0.79296875, + "learning_rate": 9.23741935483871e-05, + "loss": 0.2089, + "step": 5229 + }, + { + "epoch": 0.08368, + "grad_norm": 0.79296875, + "learning_rate": 9.23725806451613e-05, + "loss": 0.2006, + "step": 5230 + }, + { + "epoch": 0.083696, + "grad_norm": 1.015625, + "learning_rate": 9.23709677419355e-05, + "loss": 0.2017, + "step": 5231 + }, + { + "epoch": 0.083712, + "grad_norm": 0.625, + "learning_rate": 9.236935483870968e-05, + "loss": 0.1769, + "step": 5232 + }, + { + "epoch": 0.083728, + "grad_norm": 0.765625, + "learning_rate": 9.236774193548388e-05, + "loss": 0.175, + "step": 5233 + }, + { + "epoch": 0.083744, + "grad_norm": 0.58203125, + "learning_rate": 9.236612903225807e-05, + "loss": 0.1714, + "step": 5234 + }, + { + "epoch": 0.08376, + "grad_norm": 0.7109375, + "learning_rate": 9.236451612903227e-05, + "loss": 0.2098, + "step": 5235 + }, + { + "epoch": 0.083776, + "grad_norm": 0.9140625, + "learning_rate": 9.236290322580645e-05, + "loss": 0.1962, + "step": 5236 + }, + { + "epoch": 0.083792, + "grad_norm": 0.734375, + "learning_rate": 9.236129032258065e-05, + "loss": 0.1613, + "step": 5237 + }, + { + "epoch": 0.083808, + "grad_norm": 0.765625, + "learning_rate": 9.235967741935484e-05, + "loss": 0.1755, + "step": 5238 + }, + { + "epoch": 0.083824, + "grad_norm": 1.0, + "learning_rate": 9.235806451612904e-05, + "loss": 0.2068, + "step": 5239 + }, + { + "epoch": 0.08384, + "grad_norm": 0.6328125, + "learning_rate": 9.235645161290322e-05, + "loss": 0.1891, + "step": 5240 + }, + { + "epoch": 0.083856, + "grad_norm": 0.765625, + "learning_rate": 9.235483870967742e-05, + "loss": 0.146, + "step": 5241 + }, + { + "epoch": 0.083872, + "grad_norm": 0.62890625, + "learning_rate": 9.235322580645161e-05, + "loss": 0.1697, + "step": 5242 + }, + { + "epoch": 0.083888, + "grad_norm": 0.59765625, + "learning_rate": 9.235161290322581e-05, + "loss": 0.1652, + "step": 5243 + }, + { + "epoch": 0.083904, + "grad_norm": 0.9296875, + "learning_rate": 9.235000000000001e-05, + "loss": 0.1712, + "step": 5244 + }, + { + "epoch": 0.08392, + "grad_norm": 0.921875, + "learning_rate": 9.234838709677421e-05, + "loss": 0.1611, + "step": 5245 + }, + { + "epoch": 0.083936, + "grad_norm": 0.73828125, + "learning_rate": 9.23467741935484e-05, + "loss": 0.152, + "step": 5246 + }, + { + "epoch": 0.083952, + "grad_norm": 0.90234375, + "learning_rate": 9.23451612903226e-05, + "loss": 0.1894, + "step": 5247 + }, + { + "epoch": 0.083968, + "grad_norm": 1.0859375, + "learning_rate": 9.234354838709678e-05, + "loss": 0.2488, + "step": 5248 + }, + { + "epoch": 0.083984, + "grad_norm": 1.40625, + "learning_rate": 9.234193548387097e-05, + "loss": 0.1895, + "step": 5249 + }, + { + "epoch": 0.084, + "grad_norm": 0.8203125, + "learning_rate": 9.234032258064517e-05, + "loss": 0.1543, + "step": 5250 + }, + { + "epoch": 0.084016, + "grad_norm": 0.8046875, + "learning_rate": 9.233870967741935e-05, + "loss": 0.2311, + "step": 5251 + }, + { + "epoch": 0.084032, + "grad_norm": 1.046875, + "learning_rate": 9.233709677419355e-05, + "loss": 0.1969, + "step": 5252 + }, + { + "epoch": 0.084048, + "grad_norm": 0.75, + "learning_rate": 9.233548387096774e-05, + "loss": 0.2016, + "step": 5253 + }, + { + "epoch": 0.084064, + "grad_norm": 1.1328125, + "learning_rate": 9.233387096774194e-05, + "loss": 0.1595, + "step": 5254 + }, + { + "epoch": 0.08408, + "grad_norm": 0.71484375, + "learning_rate": 9.233225806451614e-05, + "loss": 0.1909, + "step": 5255 + }, + { + "epoch": 0.084096, + "grad_norm": 0.68359375, + "learning_rate": 9.233064516129034e-05, + "loss": 0.1363, + "step": 5256 + }, + { + "epoch": 0.084112, + "grad_norm": 0.52734375, + "learning_rate": 9.232903225806452e-05, + "loss": 0.187, + "step": 5257 + }, + { + "epoch": 0.084128, + "grad_norm": 0.78125, + "learning_rate": 9.232741935483872e-05, + "loss": 0.1848, + "step": 5258 + }, + { + "epoch": 0.084144, + "grad_norm": 0.515625, + "learning_rate": 9.232580645161291e-05, + "loss": 0.1438, + "step": 5259 + }, + { + "epoch": 0.08416, + "grad_norm": 0.73828125, + "learning_rate": 9.232419354838711e-05, + "loss": 0.1548, + "step": 5260 + }, + { + "epoch": 0.084176, + "grad_norm": 0.66015625, + "learning_rate": 9.23225806451613e-05, + "loss": 0.1514, + "step": 5261 + }, + { + "epoch": 0.084192, + "grad_norm": 1.1953125, + "learning_rate": 9.23209677419355e-05, + "loss": 0.2122, + "step": 5262 + }, + { + "epoch": 0.084208, + "grad_norm": 0.6171875, + "learning_rate": 9.231935483870968e-05, + "loss": 0.1326, + "step": 5263 + }, + { + "epoch": 0.084224, + "grad_norm": 1.0390625, + "learning_rate": 9.231774193548387e-05, + "loss": 0.2082, + "step": 5264 + }, + { + "epoch": 0.08424, + "grad_norm": 1.578125, + "learning_rate": 9.231612903225807e-05, + "loss": 0.1698, + "step": 5265 + }, + { + "epoch": 0.084256, + "grad_norm": 1.0625, + "learning_rate": 9.231451612903225e-05, + "loss": 0.169, + "step": 5266 + }, + { + "epoch": 0.084272, + "grad_norm": 0.63671875, + "learning_rate": 9.231290322580645e-05, + "loss": 0.1732, + "step": 5267 + }, + { + "epoch": 0.084288, + "grad_norm": 0.8359375, + "learning_rate": 9.231129032258065e-05, + "loss": 0.2071, + "step": 5268 + }, + { + "epoch": 0.084304, + "grad_norm": 0.7421875, + "learning_rate": 9.230967741935485e-05, + "loss": 0.1432, + "step": 5269 + }, + { + "epoch": 0.08432, + "grad_norm": 0.78125, + "learning_rate": 9.230806451612904e-05, + "loss": 0.2179, + "step": 5270 + }, + { + "epoch": 0.084336, + "grad_norm": 1.390625, + "learning_rate": 9.230645161290324e-05, + "loss": 0.181, + "step": 5271 + }, + { + "epoch": 0.084352, + "grad_norm": 0.94140625, + "learning_rate": 9.230483870967742e-05, + "loss": 0.1814, + "step": 5272 + }, + { + "epoch": 0.084368, + "grad_norm": 0.474609375, + "learning_rate": 9.230322580645162e-05, + "loss": 0.1324, + "step": 5273 + }, + { + "epoch": 0.084384, + "grad_norm": 1.109375, + "learning_rate": 9.230161290322581e-05, + "loss": 0.2218, + "step": 5274 + }, + { + "epoch": 0.0844, + "grad_norm": 0.72265625, + "learning_rate": 9.230000000000001e-05, + "loss": 0.1516, + "step": 5275 + }, + { + "epoch": 0.084416, + "grad_norm": 0.79296875, + "learning_rate": 9.22983870967742e-05, + "loss": 0.2019, + "step": 5276 + }, + { + "epoch": 0.084432, + "grad_norm": 0.86328125, + "learning_rate": 9.229677419354839e-05, + "loss": 0.1901, + "step": 5277 + }, + { + "epoch": 0.084448, + "grad_norm": 0.8046875, + "learning_rate": 9.229516129032258e-05, + "loss": 0.1772, + "step": 5278 + }, + { + "epoch": 0.084464, + "grad_norm": 0.6640625, + "learning_rate": 9.229354838709678e-05, + "loss": 0.1652, + "step": 5279 + }, + { + "epoch": 0.08448, + "grad_norm": 0.7265625, + "learning_rate": 9.229193548387098e-05, + "loss": 0.1937, + "step": 5280 + }, + { + "epoch": 0.084496, + "grad_norm": 0.74609375, + "learning_rate": 9.229032258064516e-05, + "loss": 0.1587, + "step": 5281 + }, + { + "epoch": 0.084512, + "grad_norm": 0.81640625, + "learning_rate": 9.228870967741936e-05, + "loss": 0.2084, + "step": 5282 + }, + { + "epoch": 0.084528, + "grad_norm": 0.71484375, + "learning_rate": 9.228709677419355e-05, + "loss": 0.1445, + "step": 5283 + }, + { + "epoch": 0.084544, + "grad_norm": 0.83203125, + "learning_rate": 9.228548387096775e-05, + "loss": 0.1874, + "step": 5284 + }, + { + "epoch": 0.08456, + "grad_norm": 0.67578125, + "learning_rate": 9.228387096774194e-05, + "loss": 0.1554, + "step": 5285 + }, + { + "epoch": 0.084576, + "grad_norm": 0.60546875, + "learning_rate": 9.228225806451614e-05, + "loss": 0.1481, + "step": 5286 + }, + { + "epoch": 0.084592, + "grad_norm": 0.78125, + "learning_rate": 9.228064516129032e-05, + "loss": 0.1555, + "step": 5287 + }, + { + "epoch": 0.084608, + "grad_norm": 0.8828125, + "learning_rate": 9.227903225806452e-05, + "loss": 0.2065, + "step": 5288 + }, + { + "epoch": 0.084624, + "grad_norm": 0.56640625, + "learning_rate": 9.227741935483871e-05, + "loss": 0.1268, + "step": 5289 + }, + { + "epoch": 0.08464, + "grad_norm": 0.8671875, + "learning_rate": 9.227580645161291e-05, + "loss": 0.1992, + "step": 5290 + }, + { + "epoch": 0.084656, + "grad_norm": 0.56640625, + "learning_rate": 9.22741935483871e-05, + "loss": 0.1451, + "step": 5291 + }, + { + "epoch": 0.084672, + "grad_norm": 0.5546875, + "learning_rate": 9.22725806451613e-05, + "loss": 0.1681, + "step": 5292 + }, + { + "epoch": 0.084688, + "grad_norm": 0.69140625, + "learning_rate": 9.227096774193549e-05, + "loss": 0.1304, + "step": 5293 + }, + { + "epoch": 0.084704, + "grad_norm": 0.91796875, + "learning_rate": 9.226935483870969e-05, + "loss": 0.1576, + "step": 5294 + }, + { + "epoch": 0.08472, + "grad_norm": 0.70703125, + "learning_rate": 9.226774193548388e-05, + "loss": 0.226, + "step": 5295 + }, + { + "epoch": 0.084736, + "grad_norm": 0.73828125, + "learning_rate": 9.226612903225806e-05, + "loss": 0.1563, + "step": 5296 + }, + { + "epoch": 0.084752, + "grad_norm": 0.875, + "learning_rate": 9.226451612903226e-05, + "loss": 0.1814, + "step": 5297 + }, + { + "epoch": 0.084768, + "grad_norm": 1.1328125, + "learning_rate": 9.226290322580645e-05, + "loss": 0.1878, + "step": 5298 + }, + { + "epoch": 0.084784, + "grad_norm": 0.921875, + "learning_rate": 9.226129032258065e-05, + "loss": 0.2141, + "step": 5299 + }, + { + "epoch": 0.0848, + "grad_norm": 0.6953125, + "learning_rate": 9.225967741935484e-05, + "loss": 0.2118, + "step": 5300 + }, + { + "epoch": 0.084816, + "grad_norm": 2.1875, + "learning_rate": 9.225806451612904e-05, + "loss": 0.1981, + "step": 5301 + }, + { + "epoch": 0.084832, + "grad_norm": 0.7265625, + "learning_rate": 9.225645161290322e-05, + "loss": 0.1811, + "step": 5302 + }, + { + "epoch": 0.084848, + "grad_norm": 0.69140625, + "learning_rate": 9.225483870967742e-05, + "loss": 0.1754, + "step": 5303 + }, + { + "epoch": 0.084864, + "grad_norm": 0.62890625, + "learning_rate": 9.225322580645162e-05, + "loss": 0.146, + "step": 5304 + }, + { + "epoch": 0.08488, + "grad_norm": 0.578125, + "learning_rate": 9.225161290322582e-05, + "loss": 0.1887, + "step": 5305 + }, + { + "epoch": 0.084896, + "grad_norm": 1.078125, + "learning_rate": 9.225e-05, + "loss": 0.1676, + "step": 5306 + }, + { + "epoch": 0.084912, + "grad_norm": 0.7890625, + "learning_rate": 9.22483870967742e-05, + "loss": 0.1843, + "step": 5307 + }, + { + "epoch": 0.084928, + "grad_norm": 1.140625, + "learning_rate": 9.224677419354839e-05, + "loss": 0.1967, + "step": 5308 + }, + { + "epoch": 0.084944, + "grad_norm": 0.6640625, + "learning_rate": 9.224516129032259e-05, + "loss": 0.1881, + "step": 5309 + }, + { + "epoch": 0.08496, + "grad_norm": 0.86328125, + "learning_rate": 9.224354838709678e-05, + "loss": 0.1845, + "step": 5310 + }, + { + "epoch": 0.084976, + "grad_norm": 0.68359375, + "learning_rate": 9.224193548387096e-05, + "loss": 0.1462, + "step": 5311 + }, + { + "epoch": 0.084992, + "grad_norm": 0.9609375, + "learning_rate": 9.224032258064516e-05, + "loss": 0.1977, + "step": 5312 + }, + { + "epoch": 0.085008, + "grad_norm": 0.64453125, + "learning_rate": 9.223870967741935e-05, + "loss": 0.194, + "step": 5313 + }, + { + "epoch": 0.085024, + "grad_norm": 0.82421875, + "learning_rate": 9.223709677419355e-05, + "loss": 0.1573, + "step": 5314 + }, + { + "epoch": 0.08504, + "grad_norm": 0.70703125, + "learning_rate": 9.223548387096775e-05, + "loss": 0.1838, + "step": 5315 + }, + { + "epoch": 0.085056, + "grad_norm": 0.70703125, + "learning_rate": 9.223387096774195e-05, + "loss": 0.1599, + "step": 5316 + }, + { + "epoch": 0.085072, + "grad_norm": 1.2109375, + "learning_rate": 9.223225806451613e-05, + "loss": 0.2132, + "step": 5317 + }, + { + "epoch": 0.085088, + "grad_norm": 0.5703125, + "learning_rate": 9.223064516129033e-05, + "loss": 0.1709, + "step": 5318 + }, + { + "epoch": 0.085104, + "grad_norm": 0.76953125, + "learning_rate": 9.222903225806452e-05, + "loss": 0.1871, + "step": 5319 + }, + { + "epoch": 0.08512, + "grad_norm": 0.6015625, + "learning_rate": 9.222741935483872e-05, + "loss": 0.18, + "step": 5320 + }, + { + "epoch": 0.085136, + "grad_norm": 0.77734375, + "learning_rate": 9.22258064516129e-05, + "loss": 0.2193, + "step": 5321 + }, + { + "epoch": 0.085152, + "grad_norm": 1.09375, + "learning_rate": 9.22241935483871e-05, + "loss": 0.2004, + "step": 5322 + }, + { + "epoch": 0.085168, + "grad_norm": 0.70703125, + "learning_rate": 9.222258064516129e-05, + "loss": 0.2091, + "step": 5323 + }, + { + "epoch": 0.085184, + "grad_norm": 0.5703125, + "learning_rate": 9.222096774193549e-05, + "loss": 0.1724, + "step": 5324 + }, + { + "epoch": 0.0852, + "grad_norm": 0.7265625, + "learning_rate": 9.221935483870968e-05, + "loss": 0.1822, + "step": 5325 + }, + { + "epoch": 0.085216, + "grad_norm": 0.8046875, + "learning_rate": 9.221774193548388e-05, + "loss": 0.1791, + "step": 5326 + }, + { + "epoch": 0.085232, + "grad_norm": 1.0234375, + "learning_rate": 9.221612903225806e-05, + "loss": 0.1761, + "step": 5327 + }, + { + "epoch": 0.085248, + "grad_norm": 0.6796875, + "learning_rate": 9.221451612903226e-05, + "loss": 0.178, + "step": 5328 + }, + { + "epoch": 0.085264, + "grad_norm": 0.9296875, + "learning_rate": 9.221290322580646e-05, + "loss": 0.1825, + "step": 5329 + }, + { + "epoch": 0.08528, + "grad_norm": 0.55078125, + "learning_rate": 9.221129032258065e-05, + "loss": 0.1492, + "step": 5330 + }, + { + "epoch": 0.085296, + "grad_norm": 0.625, + "learning_rate": 9.220967741935485e-05, + "loss": 0.1705, + "step": 5331 + }, + { + "epoch": 0.085312, + "grad_norm": 0.9921875, + "learning_rate": 9.220806451612903e-05, + "loss": 0.1488, + "step": 5332 + }, + { + "epoch": 0.085328, + "grad_norm": 0.50390625, + "learning_rate": 9.220645161290323e-05, + "loss": 0.2016, + "step": 5333 + }, + { + "epoch": 0.085344, + "grad_norm": 0.82421875, + "learning_rate": 9.220483870967742e-05, + "loss": 0.2205, + "step": 5334 + }, + { + "epoch": 0.08536, + "grad_norm": 0.66796875, + "learning_rate": 9.220322580645162e-05, + "loss": 0.1718, + "step": 5335 + }, + { + "epoch": 0.085376, + "grad_norm": 0.64453125, + "learning_rate": 9.22016129032258e-05, + "loss": 0.1552, + "step": 5336 + }, + { + "epoch": 0.085392, + "grad_norm": 0.7734375, + "learning_rate": 9.22e-05, + "loss": 0.2011, + "step": 5337 + }, + { + "epoch": 0.085408, + "grad_norm": 0.953125, + "learning_rate": 9.219838709677419e-05, + "loss": 0.1867, + "step": 5338 + }, + { + "epoch": 0.085424, + "grad_norm": 0.74609375, + "learning_rate": 9.219677419354839e-05, + "loss": 0.1818, + "step": 5339 + }, + { + "epoch": 0.08544, + "grad_norm": 0.98046875, + "learning_rate": 9.219516129032259e-05, + "loss": 0.173, + "step": 5340 + }, + { + "epoch": 0.085456, + "grad_norm": 1.0078125, + "learning_rate": 9.219354838709678e-05, + "loss": 0.2133, + "step": 5341 + }, + { + "epoch": 0.085472, + "grad_norm": 0.81640625, + "learning_rate": 9.219193548387098e-05, + "loss": 0.1597, + "step": 5342 + }, + { + "epoch": 0.085488, + "grad_norm": 0.8984375, + "learning_rate": 9.219032258064516e-05, + "loss": 0.1432, + "step": 5343 + }, + { + "epoch": 0.085504, + "grad_norm": 1.078125, + "learning_rate": 9.218870967741936e-05, + "loss": 0.2004, + "step": 5344 + }, + { + "epoch": 0.08552, + "grad_norm": 1.0, + "learning_rate": 9.218709677419355e-05, + "loss": 0.1699, + "step": 5345 + }, + { + "epoch": 0.085536, + "grad_norm": 0.62109375, + "learning_rate": 9.218548387096775e-05, + "loss": 0.1825, + "step": 5346 + }, + { + "epoch": 0.085552, + "grad_norm": 0.98046875, + "learning_rate": 9.218387096774193e-05, + "loss": 0.2017, + "step": 5347 + }, + { + "epoch": 0.085568, + "grad_norm": 0.84765625, + "learning_rate": 9.218225806451613e-05, + "loss": 0.1743, + "step": 5348 + }, + { + "epoch": 0.085584, + "grad_norm": 0.859375, + "learning_rate": 9.218064516129032e-05, + "loss": 0.1523, + "step": 5349 + }, + { + "epoch": 0.0856, + "grad_norm": 1.28125, + "learning_rate": 9.217903225806452e-05, + "loss": 0.1971, + "step": 5350 + }, + { + "epoch": 0.085616, + "grad_norm": 0.7109375, + "learning_rate": 9.217741935483872e-05, + "loss": 0.1637, + "step": 5351 + }, + { + "epoch": 0.085632, + "grad_norm": 0.55859375, + "learning_rate": 9.217580645161292e-05, + "loss": 0.1641, + "step": 5352 + }, + { + "epoch": 0.085648, + "grad_norm": 0.95703125, + "learning_rate": 9.21741935483871e-05, + "loss": 0.197, + "step": 5353 + }, + { + "epoch": 0.085664, + "grad_norm": 0.82421875, + "learning_rate": 9.21725806451613e-05, + "loss": 0.1674, + "step": 5354 + }, + { + "epoch": 0.08568, + "grad_norm": 0.73828125, + "learning_rate": 9.217096774193549e-05, + "loss": 0.172, + "step": 5355 + }, + { + "epoch": 0.085696, + "grad_norm": 0.8125, + "learning_rate": 9.216935483870969e-05, + "loss": 0.1769, + "step": 5356 + }, + { + "epoch": 0.085712, + "grad_norm": 0.73828125, + "learning_rate": 9.216774193548388e-05, + "loss": 0.1659, + "step": 5357 + }, + { + "epoch": 0.085728, + "grad_norm": 0.78125, + "learning_rate": 9.216612903225806e-05, + "loss": 0.2006, + "step": 5358 + }, + { + "epoch": 0.085744, + "grad_norm": 0.7734375, + "learning_rate": 9.216451612903226e-05, + "loss": 0.1699, + "step": 5359 + }, + { + "epoch": 0.08576, + "grad_norm": 0.91015625, + "learning_rate": 9.216290322580645e-05, + "loss": 0.1497, + "step": 5360 + }, + { + "epoch": 0.085776, + "grad_norm": 0.89453125, + "learning_rate": 9.216129032258065e-05, + "loss": 0.1897, + "step": 5361 + }, + { + "epoch": 0.085792, + "grad_norm": 0.81640625, + "learning_rate": 9.215967741935483e-05, + "loss": 0.1911, + "step": 5362 + }, + { + "epoch": 0.085808, + "grad_norm": 0.70703125, + "learning_rate": 9.215806451612903e-05, + "loss": 0.1534, + "step": 5363 + }, + { + "epoch": 0.085824, + "grad_norm": 0.84765625, + "learning_rate": 9.215645161290323e-05, + "loss": 0.1727, + "step": 5364 + }, + { + "epoch": 0.08584, + "grad_norm": 0.52734375, + "learning_rate": 9.215483870967743e-05, + "loss": 0.1404, + "step": 5365 + }, + { + "epoch": 0.085856, + "grad_norm": 0.447265625, + "learning_rate": 9.215322580645162e-05, + "loss": 0.1529, + "step": 5366 + }, + { + "epoch": 0.085872, + "grad_norm": 0.91015625, + "learning_rate": 9.215161290322582e-05, + "loss": 0.1901, + "step": 5367 + }, + { + "epoch": 0.085888, + "grad_norm": 0.7734375, + "learning_rate": 9.215e-05, + "loss": 0.1942, + "step": 5368 + }, + { + "epoch": 0.085904, + "grad_norm": 0.65625, + "learning_rate": 9.21483870967742e-05, + "loss": 0.1801, + "step": 5369 + }, + { + "epoch": 0.08592, + "grad_norm": 0.6796875, + "learning_rate": 9.214677419354839e-05, + "loss": 0.17, + "step": 5370 + }, + { + "epoch": 0.085936, + "grad_norm": 1.28125, + "learning_rate": 9.214516129032259e-05, + "loss": 0.2352, + "step": 5371 + }, + { + "epoch": 0.085952, + "grad_norm": 0.69140625, + "learning_rate": 9.214354838709678e-05, + "loss": 0.1614, + "step": 5372 + }, + { + "epoch": 0.085968, + "grad_norm": 0.6328125, + "learning_rate": 9.214193548387096e-05, + "loss": 0.1519, + "step": 5373 + }, + { + "epoch": 0.085984, + "grad_norm": 0.6328125, + "learning_rate": 9.214032258064516e-05, + "loss": 0.1498, + "step": 5374 + }, + { + "epoch": 0.086, + "grad_norm": 0.7578125, + "learning_rate": 9.213870967741936e-05, + "loss": 0.1683, + "step": 5375 + }, + { + "epoch": 0.086016, + "grad_norm": 0.65625, + "learning_rate": 9.213709677419356e-05, + "loss": 0.1942, + "step": 5376 + }, + { + "epoch": 0.086032, + "grad_norm": 0.5703125, + "learning_rate": 9.213548387096775e-05, + "loss": 0.1661, + "step": 5377 + }, + { + "epoch": 0.086048, + "grad_norm": 0.80078125, + "learning_rate": 9.213387096774195e-05, + "loss": 0.2177, + "step": 5378 + }, + { + "epoch": 0.086064, + "grad_norm": 0.94921875, + "learning_rate": 9.213225806451613e-05, + "loss": 0.2055, + "step": 5379 + }, + { + "epoch": 0.08608, + "grad_norm": 0.91015625, + "learning_rate": 9.213064516129033e-05, + "loss": 0.1702, + "step": 5380 + }, + { + "epoch": 0.086096, + "grad_norm": 1.3359375, + "learning_rate": 9.212903225806452e-05, + "loss": 0.1861, + "step": 5381 + }, + { + "epoch": 0.086112, + "grad_norm": 0.55859375, + "learning_rate": 9.212741935483872e-05, + "loss": 0.1516, + "step": 5382 + }, + { + "epoch": 0.086128, + "grad_norm": 0.9765625, + "learning_rate": 9.21258064516129e-05, + "loss": 0.128, + "step": 5383 + }, + { + "epoch": 0.086144, + "grad_norm": 0.76953125, + "learning_rate": 9.21241935483871e-05, + "loss": 0.189, + "step": 5384 + }, + { + "epoch": 0.08616, + "grad_norm": 0.69921875, + "learning_rate": 9.212258064516129e-05, + "loss": 0.1557, + "step": 5385 + }, + { + "epoch": 0.086176, + "grad_norm": 0.7890625, + "learning_rate": 9.212096774193549e-05, + "loss": 0.1593, + "step": 5386 + }, + { + "epoch": 0.086192, + "grad_norm": 0.9453125, + "learning_rate": 9.211935483870969e-05, + "loss": 0.2173, + "step": 5387 + }, + { + "epoch": 0.086208, + "grad_norm": 0.74609375, + "learning_rate": 9.211774193548387e-05, + "loss": 0.1803, + "step": 5388 + }, + { + "epoch": 0.086224, + "grad_norm": 0.9453125, + "learning_rate": 9.211612903225807e-05, + "loss": 0.197, + "step": 5389 + }, + { + "epoch": 0.08624, + "grad_norm": 0.9453125, + "learning_rate": 9.211451612903226e-05, + "loss": 0.1747, + "step": 5390 + }, + { + "epoch": 0.086256, + "grad_norm": 0.953125, + "learning_rate": 9.211290322580646e-05, + "loss": 0.1719, + "step": 5391 + }, + { + "epoch": 0.086272, + "grad_norm": 0.75, + "learning_rate": 9.211129032258065e-05, + "loss": 0.1852, + "step": 5392 + }, + { + "epoch": 0.086288, + "grad_norm": 1.0703125, + "learning_rate": 9.210967741935485e-05, + "loss": 0.1991, + "step": 5393 + }, + { + "epoch": 0.086304, + "grad_norm": 1.0859375, + "learning_rate": 9.210806451612903e-05, + "loss": 0.1968, + "step": 5394 + }, + { + "epoch": 0.08632, + "grad_norm": 0.6328125, + "learning_rate": 9.210645161290323e-05, + "loss": 0.171, + "step": 5395 + }, + { + "epoch": 0.086336, + "grad_norm": 0.87109375, + "learning_rate": 9.210483870967742e-05, + "loss": 0.1804, + "step": 5396 + }, + { + "epoch": 0.086352, + "grad_norm": 0.52734375, + "learning_rate": 9.210322580645162e-05, + "loss": 0.1655, + "step": 5397 + }, + { + "epoch": 0.086368, + "grad_norm": 0.75, + "learning_rate": 9.21016129032258e-05, + "loss": 0.1651, + "step": 5398 + }, + { + "epoch": 0.086384, + "grad_norm": 0.921875, + "learning_rate": 9.21e-05, + "loss": 0.1425, + "step": 5399 + }, + { + "epoch": 0.0864, + "grad_norm": 0.83984375, + "learning_rate": 9.20983870967742e-05, + "loss": 0.1864, + "step": 5400 + }, + { + "epoch": 0.086416, + "grad_norm": 0.875, + "learning_rate": 9.20967741935484e-05, + "loss": 0.2062, + "step": 5401 + }, + { + "epoch": 0.086432, + "grad_norm": 0.62109375, + "learning_rate": 9.209516129032259e-05, + "loss": 0.1621, + "step": 5402 + }, + { + "epoch": 0.086448, + "grad_norm": 2.109375, + "learning_rate": 9.209354838709679e-05, + "loss": 0.257, + "step": 5403 + }, + { + "epoch": 0.086464, + "grad_norm": 0.59765625, + "learning_rate": 9.209193548387097e-05, + "loss": 0.1538, + "step": 5404 + }, + { + "epoch": 0.08648, + "grad_norm": 0.69140625, + "learning_rate": 9.209032258064516e-05, + "loss": 0.1675, + "step": 5405 + }, + { + "epoch": 0.086496, + "grad_norm": 1.34375, + "learning_rate": 9.208870967741936e-05, + "loss": 0.1894, + "step": 5406 + }, + { + "epoch": 0.086512, + "grad_norm": 1.2265625, + "learning_rate": 9.208709677419355e-05, + "loss": 0.1682, + "step": 5407 + }, + { + "epoch": 0.086528, + "grad_norm": 0.640625, + "learning_rate": 9.208548387096774e-05, + "loss": 0.1681, + "step": 5408 + }, + { + "epoch": 0.086544, + "grad_norm": 0.52734375, + "learning_rate": 9.208387096774193e-05, + "loss": 0.1533, + "step": 5409 + }, + { + "epoch": 0.08656, + "grad_norm": 0.953125, + "learning_rate": 9.208225806451613e-05, + "loss": 0.1938, + "step": 5410 + }, + { + "epoch": 0.086576, + "grad_norm": 0.9609375, + "learning_rate": 9.208064516129033e-05, + "loss": 0.1894, + "step": 5411 + }, + { + "epoch": 0.086592, + "grad_norm": 1.1015625, + "learning_rate": 9.207903225806453e-05, + "loss": 0.215, + "step": 5412 + }, + { + "epoch": 0.086608, + "grad_norm": 0.7734375, + "learning_rate": 9.207741935483872e-05, + "loss": 0.182, + "step": 5413 + }, + { + "epoch": 0.086624, + "grad_norm": 0.6484375, + "learning_rate": 9.207580645161292e-05, + "loss": 0.1534, + "step": 5414 + }, + { + "epoch": 0.08664, + "grad_norm": 0.73828125, + "learning_rate": 9.20741935483871e-05, + "loss": 0.1824, + "step": 5415 + }, + { + "epoch": 0.086656, + "grad_norm": 0.490234375, + "learning_rate": 9.20725806451613e-05, + "loss": 0.1415, + "step": 5416 + }, + { + "epoch": 0.086672, + "grad_norm": 0.60546875, + "learning_rate": 9.207096774193549e-05, + "loss": 0.1666, + "step": 5417 + }, + { + "epoch": 0.086688, + "grad_norm": 0.7265625, + "learning_rate": 9.206935483870969e-05, + "loss": 0.1912, + "step": 5418 + }, + { + "epoch": 0.086704, + "grad_norm": 1.203125, + "learning_rate": 9.206774193548387e-05, + "loss": 0.1772, + "step": 5419 + }, + { + "epoch": 0.08672, + "grad_norm": 0.74609375, + "learning_rate": 9.206612903225806e-05, + "loss": 0.1786, + "step": 5420 + }, + { + "epoch": 0.086736, + "grad_norm": 0.53515625, + "learning_rate": 9.206451612903226e-05, + "loss": 0.1508, + "step": 5421 + }, + { + "epoch": 0.086752, + "grad_norm": 0.65625, + "learning_rate": 9.206290322580644e-05, + "loss": 0.1939, + "step": 5422 + }, + { + "epoch": 0.086768, + "grad_norm": 1.6484375, + "learning_rate": 9.206129032258064e-05, + "loss": 0.2117, + "step": 5423 + }, + { + "epoch": 0.086784, + "grad_norm": 0.75390625, + "learning_rate": 9.205967741935484e-05, + "loss": 0.1943, + "step": 5424 + }, + { + "epoch": 0.0868, + "grad_norm": 0.61328125, + "learning_rate": 9.205806451612904e-05, + "loss": 0.1745, + "step": 5425 + }, + { + "epoch": 0.086816, + "grad_norm": 0.85546875, + "learning_rate": 9.205645161290323e-05, + "loss": 0.1716, + "step": 5426 + }, + { + "epoch": 0.086832, + "grad_norm": 1.15625, + "learning_rate": 9.205483870967743e-05, + "loss": 0.2341, + "step": 5427 + }, + { + "epoch": 0.086848, + "grad_norm": 0.91796875, + "learning_rate": 9.205322580645162e-05, + "loss": 0.1883, + "step": 5428 + }, + { + "epoch": 0.086864, + "grad_norm": 0.9921875, + "learning_rate": 9.205161290322582e-05, + "loss": 0.1831, + "step": 5429 + }, + { + "epoch": 0.08688, + "grad_norm": 0.9296875, + "learning_rate": 9.205e-05, + "loss": 0.1985, + "step": 5430 + }, + { + "epoch": 0.086896, + "grad_norm": 0.9453125, + "learning_rate": 9.20483870967742e-05, + "loss": 0.201, + "step": 5431 + }, + { + "epoch": 0.086912, + "grad_norm": 1.125, + "learning_rate": 9.204677419354839e-05, + "loss": 0.2274, + "step": 5432 + }, + { + "epoch": 0.086928, + "grad_norm": 0.69921875, + "learning_rate": 9.204516129032259e-05, + "loss": 0.1698, + "step": 5433 + }, + { + "epoch": 0.086944, + "grad_norm": 0.61328125, + "learning_rate": 9.204354838709677e-05, + "loss": 0.1743, + "step": 5434 + }, + { + "epoch": 0.08696, + "grad_norm": 0.71484375, + "learning_rate": 9.204193548387097e-05, + "loss": 0.1926, + "step": 5435 + }, + { + "epoch": 0.086976, + "grad_norm": 0.7421875, + "learning_rate": 9.204032258064517e-05, + "loss": 0.1525, + "step": 5436 + }, + { + "epoch": 0.086992, + "grad_norm": 0.6015625, + "learning_rate": 9.203870967741936e-05, + "loss": 0.1519, + "step": 5437 + }, + { + "epoch": 0.087008, + "grad_norm": 1.515625, + "learning_rate": 9.203709677419356e-05, + "loss": 0.2205, + "step": 5438 + }, + { + "epoch": 0.087024, + "grad_norm": 0.64453125, + "learning_rate": 9.203548387096774e-05, + "loss": 0.164, + "step": 5439 + }, + { + "epoch": 0.08704, + "grad_norm": 1.359375, + "learning_rate": 9.203387096774194e-05, + "loss": 0.164, + "step": 5440 + }, + { + "epoch": 0.087056, + "grad_norm": 0.8359375, + "learning_rate": 9.203225806451613e-05, + "loss": 0.1325, + "step": 5441 + }, + { + "epoch": 0.087072, + "grad_norm": 1.53125, + "learning_rate": 9.203064516129033e-05, + "loss": 0.2436, + "step": 5442 + }, + { + "epoch": 0.087088, + "grad_norm": 0.63671875, + "learning_rate": 9.202903225806452e-05, + "loss": 0.164, + "step": 5443 + }, + { + "epoch": 0.087104, + "grad_norm": 0.88671875, + "learning_rate": 9.202741935483871e-05, + "loss": 0.1961, + "step": 5444 + }, + { + "epoch": 0.08712, + "grad_norm": 0.77734375, + "learning_rate": 9.20258064516129e-05, + "loss": 0.1712, + "step": 5445 + }, + { + "epoch": 0.087136, + "grad_norm": 0.984375, + "learning_rate": 9.20241935483871e-05, + "loss": 0.1724, + "step": 5446 + }, + { + "epoch": 0.087152, + "grad_norm": 1.0234375, + "learning_rate": 9.20225806451613e-05, + "loss": 0.1675, + "step": 5447 + }, + { + "epoch": 0.087168, + "grad_norm": 1.3671875, + "learning_rate": 9.20209677419355e-05, + "loss": 0.2119, + "step": 5448 + }, + { + "epoch": 0.087184, + "grad_norm": 0.81640625, + "learning_rate": 9.201935483870969e-05, + "loss": 0.2271, + "step": 5449 + }, + { + "epoch": 0.0872, + "grad_norm": 0.71484375, + "learning_rate": 9.201774193548387e-05, + "loss": 0.1942, + "step": 5450 + }, + { + "epoch": 0.087216, + "grad_norm": 0.76953125, + "learning_rate": 9.201612903225807e-05, + "loss": 0.1447, + "step": 5451 + }, + { + "epoch": 0.087232, + "grad_norm": 0.87109375, + "learning_rate": 9.201451612903226e-05, + "loss": 0.2061, + "step": 5452 + }, + { + "epoch": 0.087248, + "grad_norm": 0.79296875, + "learning_rate": 9.201290322580646e-05, + "loss": 0.2052, + "step": 5453 + }, + { + "epoch": 0.087264, + "grad_norm": 1.953125, + "learning_rate": 9.201129032258064e-05, + "loss": 0.1864, + "step": 5454 + }, + { + "epoch": 0.08728, + "grad_norm": 1.296875, + "learning_rate": 9.200967741935484e-05, + "loss": 0.1821, + "step": 5455 + }, + { + "epoch": 0.087296, + "grad_norm": 1.6484375, + "learning_rate": 9.200806451612903e-05, + "loss": 0.1847, + "step": 5456 + }, + { + "epoch": 0.087312, + "grad_norm": 0.9375, + "learning_rate": 9.200645161290323e-05, + "loss": 0.2177, + "step": 5457 + }, + { + "epoch": 0.087328, + "grad_norm": 0.55859375, + "learning_rate": 9.200483870967741e-05, + "loss": 0.1543, + "step": 5458 + }, + { + "epoch": 0.087344, + "grad_norm": 1.8125, + "learning_rate": 9.200322580645161e-05, + "loss": 0.1767, + "step": 5459 + }, + { + "epoch": 0.08736, + "grad_norm": 0.55859375, + "learning_rate": 9.200161290322581e-05, + "loss": 0.167, + "step": 5460 + }, + { + "epoch": 0.087376, + "grad_norm": 0.77734375, + "learning_rate": 9.200000000000001e-05, + "loss": 0.1518, + "step": 5461 + }, + { + "epoch": 0.087392, + "grad_norm": 1.015625, + "learning_rate": 9.19983870967742e-05, + "loss": 0.1833, + "step": 5462 + }, + { + "epoch": 0.087408, + "grad_norm": 0.8203125, + "learning_rate": 9.19967741935484e-05, + "loss": 0.1763, + "step": 5463 + }, + { + "epoch": 0.087424, + "grad_norm": 0.83203125, + "learning_rate": 9.199516129032259e-05, + "loss": 0.2047, + "step": 5464 + }, + { + "epoch": 0.08744, + "grad_norm": 1.296875, + "learning_rate": 9.199354838709679e-05, + "loss": 0.1798, + "step": 5465 + }, + { + "epoch": 0.087456, + "grad_norm": 1.3515625, + "learning_rate": 9.199193548387097e-05, + "loss": 0.1853, + "step": 5466 + }, + { + "epoch": 0.087472, + "grad_norm": 0.6953125, + "learning_rate": 9.199032258064516e-05, + "loss": 0.1859, + "step": 5467 + }, + { + "epoch": 0.087488, + "grad_norm": 0.6171875, + "learning_rate": 9.198870967741936e-05, + "loss": 0.1721, + "step": 5468 + }, + { + "epoch": 0.087504, + "grad_norm": 0.54296875, + "learning_rate": 9.198709677419354e-05, + "loss": 0.1772, + "step": 5469 + }, + { + "epoch": 0.08752, + "grad_norm": 0.69140625, + "learning_rate": 9.198548387096774e-05, + "loss": 0.2085, + "step": 5470 + }, + { + "epoch": 0.087536, + "grad_norm": 1.0625, + "learning_rate": 9.198387096774194e-05, + "loss": 0.2085, + "step": 5471 + }, + { + "epoch": 0.087552, + "grad_norm": 1.15625, + "learning_rate": 9.198225806451614e-05, + "loss": 0.1641, + "step": 5472 + }, + { + "epoch": 0.087568, + "grad_norm": 0.8984375, + "learning_rate": 9.198064516129033e-05, + "loss": 0.1914, + "step": 5473 + }, + { + "epoch": 0.087584, + "grad_norm": 0.6484375, + "learning_rate": 9.197903225806453e-05, + "loss": 0.1784, + "step": 5474 + }, + { + "epoch": 0.0876, + "grad_norm": 1.1796875, + "learning_rate": 9.197741935483871e-05, + "loss": 0.1437, + "step": 5475 + }, + { + "epoch": 0.087616, + "grad_norm": 0.72265625, + "learning_rate": 9.197580645161291e-05, + "loss": 0.1672, + "step": 5476 + }, + { + "epoch": 0.087632, + "grad_norm": 1.171875, + "learning_rate": 9.19741935483871e-05, + "loss": 0.2004, + "step": 5477 + }, + { + "epoch": 0.087648, + "grad_norm": 0.70703125, + "learning_rate": 9.19725806451613e-05, + "loss": 0.1784, + "step": 5478 + }, + { + "epoch": 0.087664, + "grad_norm": 1.453125, + "learning_rate": 9.197096774193548e-05, + "loss": 0.2439, + "step": 5479 + }, + { + "epoch": 0.08768, + "grad_norm": 0.640625, + "learning_rate": 9.196935483870968e-05, + "loss": 0.1695, + "step": 5480 + }, + { + "epoch": 0.087696, + "grad_norm": 1.1328125, + "learning_rate": 9.196774193548387e-05, + "loss": 0.1824, + "step": 5481 + }, + { + "epoch": 0.087712, + "grad_norm": 0.61328125, + "learning_rate": 9.196612903225807e-05, + "loss": 0.1792, + "step": 5482 + }, + { + "epoch": 0.087728, + "grad_norm": 0.703125, + "learning_rate": 9.196451612903227e-05, + "loss": 0.1668, + "step": 5483 + }, + { + "epoch": 0.087744, + "grad_norm": 0.8984375, + "learning_rate": 9.196290322580646e-05, + "loss": 0.1783, + "step": 5484 + }, + { + "epoch": 0.08776, + "grad_norm": 0.734375, + "learning_rate": 9.196129032258066e-05, + "loss": 0.1497, + "step": 5485 + }, + { + "epoch": 0.087776, + "grad_norm": 0.9609375, + "learning_rate": 9.195967741935484e-05, + "loss": 0.1562, + "step": 5486 + }, + { + "epoch": 0.087792, + "grad_norm": 0.921875, + "learning_rate": 9.195806451612904e-05, + "loss": 0.1832, + "step": 5487 + }, + { + "epoch": 0.087808, + "grad_norm": 1.15625, + "learning_rate": 9.195645161290323e-05, + "loss": 0.1749, + "step": 5488 + }, + { + "epoch": 0.087824, + "grad_norm": 1.4375, + "learning_rate": 9.195483870967743e-05, + "loss": 0.2118, + "step": 5489 + }, + { + "epoch": 0.08784, + "grad_norm": 0.91796875, + "learning_rate": 9.195322580645161e-05, + "loss": 0.2265, + "step": 5490 + }, + { + "epoch": 0.087856, + "grad_norm": 1.9609375, + "learning_rate": 9.195161290322581e-05, + "loss": 0.2289, + "step": 5491 + }, + { + "epoch": 0.087872, + "grad_norm": 1.3359375, + "learning_rate": 9.195e-05, + "loss": 0.1681, + "step": 5492 + }, + { + "epoch": 0.087888, + "grad_norm": 0.94140625, + "learning_rate": 9.19483870967742e-05, + "loss": 0.1788, + "step": 5493 + }, + { + "epoch": 0.087904, + "grad_norm": 0.6875, + "learning_rate": 9.194677419354838e-05, + "loss": 0.1783, + "step": 5494 + }, + { + "epoch": 0.08792, + "grad_norm": 0.71484375, + "learning_rate": 9.194516129032258e-05, + "loss": 0.1701, + "step": 5495 + }, + { + "epoch": 0.087936, + "grad_norm": 1.0625, + "learning_rate": 9.194354838709678e-05, + "loss": 0.1779, + "step": 5496 + }, + { + "epoch": 0.087952, + "grad_norm": 1.1015625, + "learning_rate": 9.194193548387097e-05, + "loss": 0.2077, + "step": 5497 + }, + { + "epoch": 0.087968, + "grad_norm": 1.0078125, + "learning_rate": 9.194032258064517e-05, + "loss": 0.1677, + "step": 5498 + }, + { + "epoch": 0.087984, + "grad_norm": 0.90234375, + "learning_rate": 9.193870967741936e-05, + "loss": 0.2093, + "step": 5499 + }, + { + "epoch": 0.088, + "grad_norm": 1.0078125, + "learning_rate": 9.193709677419356e-05, + "loss": 0.2172, + "step": 5500 + }, + { + "epoch": 0.088016, + "grad_norm": 0.5703125, + "learning_rate": 9.193548387096774e-05, + "loss": 0.1705, + "step": 5501 + }, + { + "epoch": 0.088032, + "grad_norm": 0.69140625, + "learning_rate": 9.193387096774194e-05, + "loss": 0.1689, + "step": 5502 + }, + { + "epoch": 0.088048, + "grad_norm": 0.7890625, + "learning_rate": 9.193225806451613e-05, + "loss": 0.1822, + "step": 5503 + }, + { + "epoch": 0.088064, + "grad_norm": 0.8046875, + "learning_rate": 9.193064516129033e-05, + "loss": 0.1544, + "step": 5504 + }, + { + "epoch": 0.08808, + "grad_norm": 0.78125, + "learning_rate": 9.192903225806451e-05, + "loss": 0.1542, + "step": 5505 + }, + { + "epoch": 0.088096, + "grad_norm": 0.65234375, + "learning_rate": 9.192741935483871e-05, + "loss": 0.1945, + "step": 5506 + }, + { + "epoch": 0.088112, + "grad_norm": 0.71875, + "learning_rate": 9.192580645161291e-05, + "loss": 0.175, + "step": 5507 + }, + { + "epoch": 0.088128, + "grad_norm": 0.8046875, + "learning_rate": 9.192419354838711e-05, + "loss": 0.1637, + "step": 5508 + }, + { + "epoch": 0.088144, + "grad_norm": 0.7265625, + "learning_rate": 9.19225806451613e-05, + "loss": 0.2008, + "step": 5509 + }, + { + "epoch": 0.08816, + "grad_norm": 0.76171875, + "learning_rate": 9.19209677419355e-05, + "loss": 0.2039, + "step": 5510 + }, + { + "epoch": 0.088176, + "grad_norm": 0.67578125, + "learning_rate": 9.191935483870968e-05, + "loss": 0.1775, + "step": 5511 + }, + { + "epoch": 0.088192, + "grad_norm": 0.73828125, + "learning_rate": 9.191774193548388e-05, + "loss": 0.1784, + "step": 5512 + }, + { + "epoch": 0.088208, + "grad_norm": 0.6171875, + "learning_rate": 9.191612903225807e-05, + "loss": 0.1964, + "step": 5513 + }, + { + "epoch": 0.088224, + "grad_norm": 0.69140625, + "learning_rate": 9.191451612903226e-05, + "loss": 0.1908, + "step": 5514 + }, + { + "epoch": 0.08824, + "grad_norm": 0.71484375, + "learning_rate": 9.191290322580645e-05, + "loss": 0.155, + "step": 5515 + }, + { + "epoch": 0.088256, + "grad_norm": 0.6328125, + "learning_rate": 9.191129032258064e-05, + "loss": 0.1487, + "step": 5516 + }, + { + "epoch": 0.088272, + "grad_norm": 0.97265625, + "learning_rate": 9.190967741935484e-05, + "loss": 0.2028, + "step": 5517 + }, + { + "epoch": 0.088288, + "grad_norm": 0.63671875, + "learning_rate": 9.190806451612903e-05, + "loss": 0.1601, + "step": 5518 + }, + { + "epoch": 0.088304, + "grad_norm": 0.58984375, + "learning_rate": 9.190645161290323e-05, + "loss": 0.205, + "step": 5519 + }, + { + "epoch": 0.08832, + "grad_norm": 0.89453125, + "learning_rate": 9.190483870967743e-05, + "loss": 0.1774, + "step": 5520 + }, + { + "epoch": 0.088336, + "grad_norm": 0.90625, + "learning_rate": 9.190322580645163e-05, + "loss": 0.199, + "step": 5521 + }, + { + "epoch": 0.088352, + "grad_norm": 0.9296875, + "learning_rate": 9.190161290322581e-05, + "loss": 0.1751, + "step": 5522 + }, + { + "epoch": 0.088368, + "grad_norm": 0.87890625, + "learning_rate": 9.190000000000001e-05, + "loss": 0.1612, + "step": 5523 + }, + { + "epoch": 0.088384, + "grad_norm": 0.7890625, + "learning_rate": 9.18983870967742e-05, + "loss": 0.1652, + "step": 5524 + }, + { + "epoch": 0.0884, + "grad_norm": 0.8203125, + "learning_rate": 9.18967741935484e-05, + "loss": 0.1821, + "step": 5525 + }, + { + "epoch": 0.088416, + "grad_norm": 0.81640625, + "learning_rate": 9.189516129032258e-05, + "loss": 0.163, + "step": 5526 + }, + { + "epoch": 0.088432, + "grad_norm": 0.84765625, + "learning_rate": 9.189354838709678e-05, + "loss": 0.2083, + "step": 5527 + }, + { + "epoch": 0.088448, + "grad_norm": 1.2890625, + "learning_rate": 9.189193548387097e-05, + "loss": 0.1565, + "step": 5528 + }, + { + "epoch": 0.088464, + "grad_norm": 1.3359375, + "learning_rate": 9.189032258064515e-05, + "loss": 0.1732, + "step": 5529 + }, + { + "epoch": 0.08848, + "grad_norm": 0.8359375, + "learning_rate": 9.188870967741935e-05, + "loss": 0.2179, + "step": 5530 + }, + { + "epoch": 0.088496, + "grad_norm": 0.65234375, + "learning_rate": 9.188709677419355e-05, + "loss": 0.127, + "step": 5531 + }, + { + "epoch": 0.088512, + "grad_norm": 0.76953125, + "learning_rate": 9.188548387096775e-05, + "loss": 0.1393, + "step": 5532 + }, + { + "epoch": 0.088528, + "grad_norm": 1.0703125, + "learning_rate": 9.188387096774194e-05, + "loss": 0.1745, + "step": 5533 + }, + { + "epoch": 0.088544, + "grad_norm": 1.2109375, + "learning_rate": 9.188225806451614e-05, + "loss": 0.2382, + "step": 5534 + }, + { + "epoch": 0.08856, + "grad_norm": 0.83984375, + "learning_rate": 9.188064516129033e-05, + "loss": 0.1816, + "step": 5535 + }, + { + "epoch": 0.088576, + "grad_norm": 0.90234375, + "learning_rate": 9.187903225806453e-05, + "loss": 0.1913, + "step": 5536 + }, + { + "epoch": 0.088592, + "grad_norm": 0.97265625, + "learning_rate": 9.187741935483871e-05, + "loss": 0.1317, + "step": 5537 + }, + { + "epoch": 0.088608, + "grad_norm": 0.765625, + "learning_rate": 9.187580645161291e-05, + "loss": 0.1889, + "step": 5538 + }, + { + "epoch": 0.088624, + "grad_norm": 0.63671875, + "learning_rate": 9.18741935483871e-05, + "loss": 0.1611, + "step": 5539 + }, + { + "epoch": 0.08864, + "grad_norm": 1.1171875, + "learning_rate": 9.18725806451613e-05, + "loss": 0.1836, + "step": 5540 + }, + { + "epoch": 0.088656, + "grad_norm": 1.421875, + "learning_rate": 9.187096774193548e-05, + "loss": 0.1664, + "step": 5541 + }, + { + "epoch": 0.088672, + "grad_norm": 0.640625, + "learning_rate": 9.186935483870968e-05, + "loss": 0.1867, + "step": 5542 + }, + { + "epoch": 0.088688, + "grad_norm": 0.70703125, + "learning_rate": 9.186774193548388e-05, + "loss": 0.1738, + "step": 5543 + }, + { + "epoch": 0.088704, + "grad_norm": 0.6328125, + "learning_rate": 9.186612903225807e-05, + "loss": 0.1423, + "step": 5544 + }, + { + "epoch": 0.08872, + "grad_norm": 1.03125, + "learning_rate": 9.186451612903227e-05, + "loss": 0.166, + "step": 5545 + }, + { + "epoch": 0.088736, + "grad_norm": 0.5234375, + "learning_rate": 9.186290322580645e-05, + "loss": 0.1492, + "step": 5546 + }, + { + "epoch": 0.088752, + "grad_norm": 0.70703125, + "learning_rate": 9.186129032258065e-05, + "loss": 0.2107, + "step": 5547 + }, + { + "epoch": 0.088768, + "grad_norm": 0.6171875, + "learning_rate": 9.185967741935484e-05, + "loss": 0.1721, + "step": 5548 + }, + { + "epoch": 0.088784, + "grad_norm": 0.6640625, + "learning_rate": 9.185806451612904e-05, + "loss": 0.1469, + "step": 5549 + }, + { + "epoch": 0.0888, + "grad_norm": 0.828125, + "learning_rate": 9.185645161290323e-05, + "loss": 0.1852, + "step": 5550 + }, + { + "epoch": 0.088816, + "grad_norm": 0.8203125, + "learning_rate": 9.185483870967742e-05, + "loss": 0.1711, + "step": 5551 + }, + { + "epoch": 0.088832, + "grad_norm": 0.6796875, + "learning_rate": 9.185322580645161e-05, + "loss": 0.1848, + "step": 5552 + }, + { + "epoch": 0.088848, + "grad_norm": 0.86328125, + "learning_rate": 9.185161290322581e-05, + "loss": 0.1837, + "step": 5553 + }, + { + "epoch": 0.088864, + "grad_norm": 0.9375, + "learning_rate": 9.185e-05, + "loss": 0.1396, + "step": 5554 + }, + { + "epoch": 0.08888, + "grad_norm": 0.6171875, + "learning_rate": 9.18483870967742e-05, + "loss": 0.1381, + "step": 5555 + }, + { + "epoch": 0.088896, + "grad_norm": 0.6953125, + "learning_rate": 9.18467741935484e-05, + "loss": 0.1294, + "step": 5556 + }, + { + "epoch": 0.088912, + "grad_norm": 0.96875, + "learning_rate": 9.18451612903226e-05, + "loss": 0.1864, + "step": 5557 + }, + { + "epoch": 0.088928, + "grad_norm": 0.7734375, + "learning_rate": 9.184354838709678e-05, + "loss": 0.1644, + "step": 5558 + }, + { + "epoch": 0.088944, + "grad_norm": 1.4453125, + "learning_rate": 9.184193548387097e-05, + "loss": 0.1741, + "step": 5559 + }, + { + "epoch": 0.08896, + "grad_norm": 0.73046875, + "learning_rate": 9.184032258064517e-05, + "loss": 0.1998, + "step": 5560 + }, + { + "epoch": 0.088976, + "grad_norm": 0.65234375, + "learning_rate": 9.183870967741935e-05, + "loss": 0.1794, + "step": 5561 + }, + { + "epoch": 0.088992, + "grad_norm": 0.62109375, + "learning_rate": 9.183709677419355e-05, + "loss": 0.1699, + "step": 5562 + }, + { + "epoch": 0.089008, + "grad_norm": 0.80859375, + "learning_rate": 9.183548387096774e-05, + "loss": 0.1456, + "step": 5563 + }, + { + "epoch": 0.089024, + "grad_norm": 1.0625, + "learning_rate": 9.183387096774194e-05, + "loss": 0.1812, + "step": 5564 + }, + { + "epoch": 0.08904, + "grad_norm": 1.078125, + "learning_rate": 9.183225806451612e-05, + "loss": 0.2052, + "step": 5565 + }, + { + "epoch": 0.089056, + "grad_norm": 0.75, + "learning_rate": 9.183064516129032e-05, + "loss": 0.1681, + "step": 5566 + }, + { + "epoch": 0.089072, + "grad_norm": 0.69140625, + "learning_rate": 9.182903225806452e-05, + "loss": 0.1797, + "step": 5567 + }, + { + "epoch": 0.089088, + "grad_norm": 0.79296875, + "learning_rate": 9.182741935483872e-05, + "loss": 0.1841, + "step": 5568 + }, + { + "epoch": 0.089104, + "grad_norm": 1.03125, + "learning_rate": 9.182580645161291e-05, + "loss": 0.2327, + "step": 5569 + }, + { + "epoch": 0.08912, + "grad_norm": 0.7265625, + "learning_rate": 9.182419354838711e-05, + "loss": 0.1689, + "step": 5570 + }, + { + "epoch": 0.089136, + "grad_norm": 0.78125, + "learning_rate": 9.18225806451613e-05, + "loss": 0.1553, + "step": 5571 + }, + { + "epoch": 0.089152, + "grad_norm": 0.7109375, + "learning_rate": 9.18209677419355e-05, + "loss": 0.1457, + "step": 5572 + }, + { + "epoch": 0.089168, + "grad_norm": 0.6796875, + "learning_rate": 9.181935483870968e-05, + "loss": 0.1993, + "step": 5573 + }, + { + "epoch": 0.089184, + "grad_norm": 0.66796875, + "learning_rate": 9.181774193548388e-05, + "loss": 0.1358, + "step": 5574 + }, + { + "epoch": 0.0892, + "grad_norm": 0.67578125, + "learning_rate": 9.181612903225807e-05, + "loss": 0.1509, + "step": 5575 + }, + { + "epoch": 0.089216, + "grad_norm": 1.015625, + "learning_rate": 9.181451612903225e-05, + "loss": 0.1652, + "step": 5576 + }, + { + "epoch": 0.089232, + "grad_norm": 1.3671875, + "learning_rate": 9.181290322580645e-05, + "loss": 0.196, + "step": 5577 + }, + { + "epoch": 0.089248, + "grad_norm": 0.84765625, + "learning_rate": 9.181129032258065e-05, + "loss": 0.2005, + "step": 5578 + }, + { + "epoch": 0.089264, + "grad_norm": 0.71875, + "learning_rate": 9.180967741935484e-05, + "loss": 0.2054, + "step": 5579 + }, + { + "epoch": 0.08928, + "grad_norm": 0.6640625, + "learning_rate": 9.180806451612904e-05, + "loss": 0.1801, + "step": 5580 + }, + { + "epoch": 0.089296, + "grad_norm": 0.546875, + "learning_rate": 9.180645161290324e-05, + "loss": 0.1665, + "step": 5581 + }, + { + "epoch": 0.089312, + "grad_norm": 0.89453125, + "learning_rate": 9.180483870967742e-05, + "loss": 0.17, + "step": 5582 + }, + { + "epoch": 0.089328, + "grad_norm": 0.6171875, + "learning_rate": 9.180322580645162e-05, + "loss": 0.1754, + "step": 5583 + }, + { + "epoch": 0.089344, + "grad_norm": 0.91796875, + "learning_rate": 9.180161290322581e-05, + "loss": 0.1652, + "step": 5584 + }, + { + "epoch": 0.08936, + "grad_norm": 0.87890625, + "learning_rate": 9.180000000000001e-05, + "loss": 0.1917, + "step": 5585 + }, + { + "epoch": 0.089376, + "grad_norm": 0.86328125, + "learning_rate": 9.17983870967742e-05, + "loss": 0.1667, + "step": 5586 + }, + { + "epoch": 0.089392, + "grad_norm": 0.625, + "learning_rate": 9.17967741935484e-05, + "loss": 0.1762, + "step": 5587 + }, + { + "epoch": 0.089408, + "grad_norm": 0.93359375, + "learning_rate": 9.179516129032258e-05, + "loss": 0.137, + "step": 5588 + }, + { + "epoch": 0.089424, + "grad_norm": 1.2109375, + "learning_rate": 9.179354838709678e-05, + "loss": 0.2214, + "step": 5589 + }, + { + "epoch": 0.08944, + "grad_norm": 0.6015625, + "learning_rate": 9.179193548387097e-05, + "loss": 0.1833, + "step": 5590 + }, + { + "epoch": 0.089456, + "grad_norm": 0.765625, + "learning_rate": 9.179032258064517e-05, + "loss": 0.2061, + "step": 5591 + }, + { + "epoch": 0.089472, + "grad_norm": 0.74609375, + "learning_rate": 9.178870967741937e-05, + "loss": 0.1532, + "step": 5592 + }, + { + "epoch": 0.089488, + "grad_norm": 0.73828125, + "learning_rate": 9.178709677419355e-05, + "loss": 0.2267, + "step": 5593 + }, + { + "epoch": 0.089504, + "grad_norm": 0.66796875, + "learning_rate": 9.178548387096775e-05, + "loss": 0.1859, + "step": 5594 + }, + { + "epoch": 0.08952, + "grad_norm": 1.0234375, + "learning_rate": 9.178387096774194e-05, + "loss": 0.1858, + "step": 5595 + }, + { + "epoch": 0.089536, + "grad_norm": 0.99609375, + "learning_rate": 9.178225806451614e-05, + "loss": 0.2036, + "step": 5596 + }, + { + "epoch": 0.089552, + "grad_norm": 1.0703125, + "learning_rate": 9.178064516129032e-05, + "loss": 0.154, + "step": 5597 + }, + { + "epoch": 0.089568, + "grad_norm": 0.64453125, + "learning_rate": 9.177903225806452e-05, + "loss": 0.1731, + "step": 5598 + }, + { + "epoch": 0.089584, + "grad_norm": 0.6953125, + "learning_rate": 9.177741935483871e-05, + "loss": 0.1599, + "step": 5599 + }, + { + "epoch": 0.0896, + "grad_norm": 0.80859375, + "learning_rate": 9.177580645161291e-05, + "loss": 0.1846, + "step": 5600 + }, + { + "epoch": 0.089616, + "grad_norm": 0.92578125, + "learning_rate": 9.17741935483871e-05, + "loss": 0.1493, + "step": 5601 + }, + { + "epoch": 0.089632, + "grad_norm": 0.95703125, + "learning_rate": 9.17725806451613e-05, + "loss": 0.1766, + "step": 5602 + }, + { + "epoch": 0.089648, + "grad_norm": 0.67578125, + "learning_rate": 9.17709677419355e-05, + "loss": 0.1736, + "step": 5603 + }, + { + "epoch": 0.089664, + "grad_norm": 1.3515625, + "learning_rate": 9.17693548387097e-05, + "loss": 0.1539, + "step": 5604 + }, + { + "epoch": 0.08968, + "grad_norm": 0.76953125, + "learning_rate": 9.176774193548388e-05, + "loss": 0.1692, + "step": 5605 + }, + { + "epoch": 0.089696, + "grad_norm": 0.76953125, + "learning_rate": 9.176612903225807e-05, + "loss": 0.1986, + "step": 5606 + }, + { + "epoch": 0.089712, + "grad_norm": 0.859375, + "learning_rate": 9.176451612903227e-05, + "loss": 0.1617, + "step": 5607 + }, + { + "epoch": 0.089728, + "grad_norm": 0.5625, + "learning_rate": 9.176290322580645e-05, + "loss": 0.1822, + "step": 5608 + }, + { + "epoch": 0.089744, + "grad_norm": 0.66015625, + "learning_rate": 9.176129032258065e-05, + "loss": 0.1809, + "step": 5609 + }, + { + "epoch": 0.08976, + "grad_norm": 0.62109375, + "learning_rate": 9.175967741935484e-05, + "loss": 0.1268, + "step": 5610 + }, + { + "epoch": 0.089776, + "grad_norm": 0.6015625, + "learning_rate": 9.175806451612904e-05, + "loss": 0.1816, + "step": 5611 + }, + { + "epoch": 0.089792, + "grad_norm": 0.498046875, + "learning_rate": 9.175645161290322e-05, + "loss": 0.1628, + "step": 5612 + }, + { + "epoch": 0.089808, + "grad_norm": 0.72265625, + "learning_rate": 9.175483870967742e-05, + "loss": 0.1796, + "step": 5613 + }, + { + "epoch": 0.089824, + "grad_norm": 0.734375, + "learning_rate": 9.175322580645161e-05, + "loss": 0.1943, + "step": 5614 + }, + { + "epoch": 0.08984, + "grad_norm": 0.8203125, + "learning_rate": 9.175161290322581e-05, + "loss": 0.1284, + "step": 5615 + }, + { + "epoch": 0.089856, + "grad_norm": 1.0546875, + "learning_rate": 9.175000000000001e-05, + "loss": 0.1461, + "step": 5616 + }, + { + "epoch": 0.089872, + "grad_norm": 0.7421875, + "learning_rate": 9.174838709677421e-05, + "loss": 0.1865, + "step": 5617 + }, + { + "epoch": 0.089888, + "grad_norm": 0.99609375, + "learning_rate": 9.17467741935484e-05, + "loss": 0.1988, + "step": 5618 + }, + { + "epoch": 0.089904, + "grad_norm": 0.671875, + "learning_rate": 9.174516129032259e-05, + "loss": 0.1727, + "step": 5619 + }, + { + "epoch": 0.08992, + "grad_norm": 0.76171875, + "learning_rate": 9.174354838709678e-05, + "loss": 0.1909, + "step": 5620 + }, + { + "epoch": 0.089936, + "grad_norm": 1.1484375, + "learning_rate": 9.174193548387097e-05, + "loss": 0.1488, + "step": 5621 + }, + { + "epoch": 0.089952, + "grad_norm": 0.76953125, + "learning_rate": 9.174032258064516e-05, + "loss": 0.1819, + "step": 5622 + }, + { + "epoch": 0.089968, + "grad_norm": 1.015625, + "learning_rate": 9.173870967741935e-05, + "loss": 0.2102, + "step": 5623 + }, + { + "epoch": 0.089984, + "grad_norm": 0.7890625, + "learning_rate": 9.173709677419355e-05, + "loss": 0.1659, + "step": 5624 + }, + { + "epoch": 0.09, + "grad_norm": 1.578125, + "learning_rate": 9.173548387096774e-05, + "loss": 0.2047, + "step": 5625 + }, + { + "epoch": 0.090016, + "grad_norm": 0.7265625, + "learning_rate": 9.173387096774194e-05, + "loss": 0.1807, + "step": 5626 + }, + { + "epoch": 0.090032, + "grad_norm": 1.2421875, + "learning_rate": 9.173225806451614e-05, + "loss": 0.1733, + "step": 5627 + }, + { + "epoch": 0.090048, + "grad_norm": 0.78125, + "learning_rate": 9.173064516129034e-05, + "loss": 0.1809, + "step": 5628 + }, + { + "epoch": 0.090064, + "grad_norm": 0.71875, + "learning_rate": 9.172903225806452e-05, + "loss": 0.1378, + "step": 5629 + }, + { + "epoch": 0.09008, + "grad_norm": 0.89453125, + "learning_rate": 9.172741935483872e-05, + "loss": 0.1387, + "step": 5630 + }, + { + "epoch": 0.090096, + "grad_norm": 0.73828125, + "learning_rate": 9.172580645161291e-05, + "loss": 0.1305, + "step": 5631 + }, + { + "epoch": 0.090112, + "grad_norm": 0.66015625, + "learning_rate": 9.17241935483871e-05, + "loss": 0.1879, + "step": 5632 + }, + { + "epoch": 0.090128, + "grad_norm": 0.6171875, + "learning_rate": 9.172258064516129e-05, + "loss": 0.1332, + "step": 5633 + }, + { + "epoch": 0.090144, + "grad_norm": 0.77734375, + "learning_rate": 9.172096774193549e-05, + "loss": 0.1525, + "step": 5634 + }, + { + "epoch": 0.09016, + "grad_norm": 0.5546875, + "learning_rate": 9.171935483870968e-05, + "loss": 0.1696, + "step": 5635 + }, + { + "epoch": 0.090176, + "grad_norm": 0.65625, + "learning_rate": 9.171774193548388e-05, + "loss": 0.1907, + "step": 5636 + }, + { + "epoch": 0.090192, + "grad_norm": 0.71484375, + "learning_rate": 9.171612903225806e-05, + "loss": 0.2091, + "step": 5637 + }, + { + "epoch": 0.090208, + "grad_norm": 0.87109375, + "learning_rate": 9.171451612903226e-05, + "loss": 0.1912, + "step": 5638 + }, + { + "epoch": 0.090224, + "grad_norm": 0.64453125, + "learning_rate": 9.171290322580646e-05, + "loss": 0.1728, + "step": 5639 + }, + { + "epoch": 0.09024, + "grad_norm": 0.765625, + "learning_rate": 9.171129032258065e-05, + "loss": 0.1871, + "step": 5640 + }, + { + "epoch": 0.090256, + "grad_norm": 0.95703125, + "learning_rate": 9.170967741935485e-05, + "loss": 0.1458, + "step": 5641 + }, + { + "epoch": 0.090272, + "grad_norm": 0.77734375, + "learning_rate": 9.170806451612904e-05, + "loss": 0.1594, + "step": 5642 + }, + { + "epoch": 0.090288, + "grad_norm": 0.7109375, + "learning_rate": 9.170645161290323e-05, + "loss": 0.1588, + "step": 5643 + }, + { + "epoch": 0.090304, + "grad_norm": 0.6875, + "learning_rate": 9.170483870967742e-05, + "loss": 0.1491, + "step": 5644 + }, + { + "epoch": 0.09032, + "grad_norm": 1.15625, + "learning_rate": 9.170322580645162e-05, + "loss": 0.1616, + "step": 5645 + }, + { + "epoch": 0.090336, + "grad_norm": 0.953125, + "learning_rate": 9.17016129032258e-05, + "loss": 0.1765, + "step": 5646 + }, + { + "epoch": 0.090352, + "grad_norm": 1.1953125, + "learning_rate": 9.17e-05, + "loss": 0.2104, + "step": 5647 + }, + { + "epoch": 0.090368, + "grad_norm": 0.8984375, + "learning_rate": 9.169838709677419e-05, + "loss": 0.1945, + "step": 5648 + }, + { + "epoch": 0.090384, + "grad_norm": 0.7109375, + "learning_rate": 9.169677419354839e-05, + "loss": 0.162, + "step": 5649 + }, + { + "epoch": 0.0904, + "grad_norm": 0.98828125, + "learning_rate": 9.169516129032258e-05, + "loss": 0.1703, + "step": 5650 + }, + { + "epoch": 0.090416, + "grad_norm": 1.6015625, + "learning_rate": 9.169354838709678e-05, + "loss": 0.1719, + "step": 5651 + }, + { + "epoch": 0.090432, + "grad_norm": 1.296875, + "learning_rate": 9.169193548387098e-05, + "loss": 0.1807, + "step": 5652 + }, + { + "epoch": 0.090448, + "grad_norm": 1.265625, + "learning_rate": 9.169032258064516e-05, + "loss": 0.1936, + "step": 5653 + }, + { + "epoch": 0.090464, + "grad_norm": 0.75, + "learning_rate": 9.168870967741936e-05, + "loss": 0.1697, + "step": 5654 + }, + { + "epoch": 0.09048, + "grad_norm": 0.78515625, + "learning_rate": 9.168709677419355e-05, + "loss": 0.1737, + "step": 5655 + }, + { + "epoch": 0.090496, + "grad_norm": 0.609375, + "learning_rate": 9.168548387096775e-05, + "loss": 0.1703, + "step": 5656 + }, + { + "epoch": 0.090512, + "grad_norm": 0.66015625, + "learning_rate": 9.168387096774193e-05, + "loss": 0.1705, + "step": 5657 + }, + { + "epoch": 0.090528, + "grad_norm": 0.79296875, + "learning_rate": 9.168225806451613e-05, + "loss": 0.2018, + "step": 5658 + }, + { + "epoch": 0.090544, + "grad_norm": 0.75390625, + "learning_rate": 9.168064516129032e-05, + "loss": 0.1513, + "step": 5659 + }, + { + "epoch": 0.09056, + "grad_norm": 0.7421875, + "learning_rate": 9.167903225806452e-05, + "loss": 0.1966, + "step": 5660 + }, + { + "epoch": 0.090576, + "grad_norm": 0.66796875, + "learning_rate": 9.16774193548387e-05, + "loss": 0.1698, + "step": 5661 + }, + { + "epoch": 0.090592, + "grad_norm": 0.98828125, + "learning_rate": 9.16758064516129e-05, + "loss": 0.189, + "step": 5662 + }, + { + "epoch": 0.090608, + "grad_norm": 0.81640625, + "learning_rate": 9.16741935483871e-05, + "loss": 0.1953, + "step": 5663 + }, + { + "epoch": 0.090624, + "grad_norm": 0.79296875, + "learning_rate": 9.16725806451613e-05, + "loss": 0.2101, + "step": 5664 + }, + { + "epoch": 0.09064, + "grad_norm": 0.74609375, + "learning_rate": 9.167096774193549e-05, + "loss": 0.1614, + "step": 5665 + }, + { + "epoch": 0.090656, + "grad_norm": 0.6875, + "learning_rate": 9.166935483870969e-05, + "loss": 0.1606, + "step": 5666 + }, + { + "epoch": 0.090672, + "grad_norm": 0.7265625, + "learning_rate": 9.166774193548388e-05, + "loss": 0.1835, + "step": 5667 + }, + { + "epoch": 0.090688, + "grad_norm": 1.140625, + "learning_rate": 9.166612903225806e-05, + "loss": 0.1825, + "step": 5668 + }, + { + "epoch": 0.090704, + "grad_norm": 0.65234375, + "learning_rate": 9.166451612903226e-05, + "loss": 0.1784, + "step": 5669 + }, + { + "epoch": 0.09072, + "grad_norm": 0.625, + "learning_rate": 9.166290322580645e-05, + "loss": 0.1547, + "step": 5670 + }, + { + "epoch": 0.090736, + "grad_norm": 0.81640625, + "learning_rate": 9.166129032258065e-05, + "loss": 0.2351, + "step": 5671 + }, + { + "epoch": 0.090752, + "grad_norm": 1.203125, + "learning_rate": 9.165967741935483e-05, + "loss": 0.1657, + "step": 5672 + }, + { + "epoch": 0.090768, + "grad_norm": 0.96484375, + "learning_rate": 9.165806451612903e-05, + "loss": 0.1757, + "step": 5673 + }, + { + "epoch": 0.090784, + "grad_norm": 0.8203125, + "learning_rate": 9.165645161290322e-05, + "loss": 0.201, + "step": 5674 + }, + { + "epoch": 0.0908, + "grad_norm": 0.6796875, + "learning_rate": 9.165483870967742e-05, + "loss": 0.1975, + "step": 5675 + }, + { + "epoch": 0.090816, + "grad_norm": 0.64453125, + "learning_rate": 9.165322580645162e-05, + "loss": 0.1607, + "step": 5676 + }, + { + "epoch": 0.090832, + "grad_norm": 0.77734375, + "learning_rate": 9.165161290322582e-05, + "loss": 0.1566, + "step": 5677 + }, + { + "epoch": 0.090848, + "grad_norm": 0.76953125, + "learning_rate": 9.165e-05, + "loss": 0.1696, + "step": 5678 + }, + { + "epoch": 0.090864, + "grad_norm": 0.95703125, + "learning_rate": 9.16483870967742e-05, + "loss": 0.2062, + "step": 5679 + }, + { + "epoch": 0.09088, + "grad_norm": 0.69140625, + "learning_rate": 9.164677419354839e-05, + "loss": 0.1883, + "step": 5680 + }, + { + "epoch": 0.090896, + "grad_norm": 0.76953125, + "learning_rate": 9.164516129032259e-05, + "loss": 0.1936, + "step": 5681 + }, + { + "epoch": 0.090912, + "grad_norm": 1.3046875, + "learning_rate": 9.164354838709678e-05, + "loss": 0.1774, + "step": 5682 + }, + { + "epoch": 0.090928, + "grad_norm": 0.56640625, + "learning_rate": 9.164193548387098e-05, + "loss": 0.1538, + "step": 5683 + }, + { + "epoch": 0.090944, + "grad_norm": 0.77734375, + "learning_rate": 9.164032258064516e-05, + "loss": 0.1996, + "step": 5684 + }, + { + "epoch": 0.09096, + "grad_norm": 0.91796875, + "learning_rate": 9.163870967741935e-05, + "loss": 0.1768, + "step": 5685 + }, + { + "epoch": 0.090976, + "grad_norm": 0.96484375, + "learning_rate": 9.163709677419355e-05, + "loss": 0.1757, + "step": 5686 + }, + { + "epoch": 0.090992, + "grad_norm": 0.7109375, + "learning_rate": 9.163548387096775e-05, + "loss": 0.1649, + "step": 5687 + }, + { + "epoch": 0.091008, + "grad_norm": 1.0546875, + "learning_rate": 9.163387096774195e-05, + "loss": 0.1442, + "step": 5688 + }, + { + "epoch": 0.091024, + "grad_norm": 0.671875, + "learning_rate": 9.163225806451613e-05, + "loss": 0.2031, + "step": 5689 + }, + { + "epoch": 0.09104, + "grad_norm": 0.70703125, + "learning_rate": 9.163064516129033e-05, + "loss": 0.19, + "step": 5690 + }, + { + "epoch": 0.091056, + "grad_norm": 1.03125, + "learning_rate": 9.162903225806452e-05, + "loss": 0.17, + "step": 5691 + }, + { + "epoch": 0.091072, + "grad_norm": 1.0703125, + "learning_rate": 9.162741935483872e-05, + "loss": 0.1365, + "step": 5692 + }, + { + "epoch": 0.091088, + "grad_norm": 0.6328125, + "learning_rate": 9.16258064516129e-05, + "loss": 0.2053, + "step": 5693 + }, + { + "epoch": 0.091104, + "grad_norm": 0.8984375, + "learning_rate": 9.16241935483871e-05, + "loss": 0.1969, + "step": 5694 + }, + { + "epoch": 0.09112, + "grad_norm": 0.6640625, + "learning_rate": 9.162258064516129e-05, + "loss": 0.2045, + "step": 5695 + }, + { + "epoch": 0.091136, + "grad_norm": 0.96484375, + "learning_rate": 9.162096774193549e-05, + "loss": 0.1749, + "step": 5696 + }, + { + "epoch": 0.091152, + "grad_norm": 0.85546875, + "learning_rate": 9.161935483870968e-05, + "loss": 0.2097, + "step": 5697 + }, + { + "epoch": 0.091168, + "grad_norm": 1.1328125, + "learning_rate": 9.161774193548388e-05, + "loss": 0.2156, + "step": 5698 + }, + { + "epoch": 0.091184, + "grad_norm": 0.59375, + "learning_rate": 9.161612903225808e-05, + "loss": 0.1665, + "step": 5699 + }, + { + "epoch": 0.0912, + "grad_norm": 0.94140625, + "learning_rate": 9.161451612903226e-05, + "loss": 0.1375, + "step": 5700 + }, + { + "epoch": 0.091216, + "grad_norm": 0.6953125, + "learning_rate": 9.161290322580646e-05, + "loss": 0.2084, + "step": 5701 + }, + { + "epoch": 0.091232, + "grad_norm": 0.765625, + "learning_rate": 9.161129032258065e-05, + "loss": 0.215, + "step": 5702 + }, + { + "epoch": 0.091248, + "grad_norm": 0.68359375, + "learning_rate": 9.160967741935485e-05, + "loss": 0.1985, + "step": 5703 + }, + { + "epoch": 0.091264, + "grad_norm": 1.21875, + "learning_rate": 9.160806451612903e-05, + "loss": 0.1664, + "step": 5704 + }, + { + "epoch": 0.09128, + "grad_norm": 1.1015625, + "learning_rate": 9.160645161290323e-05, + "loss": 0.2183, + "step": 5705 + }, + { + "epoch": 0.091296, + "grad_norm": 0.89453125, + "learning_rate": 9.160483870967742e-05, + "loss": 0.2135, + "step": 5706 + }, + { + "epoch": 0.091312, + "grad_norm": 0.6328125, + "learning_rate": 9.160322580645162e-05, + "loss": 0.1599, + "step": 5707 + }, + { + "epoch": 0.091328, + "grad_norm": 0.86328125, + "learning_rate": 9.16016129032258e-05, + "loss": 0.1796, + "step": 5708 + }, + { + "epoch": 0.091344, + "grad_norm": 0.88671875, + "learning_rate": 9.16e-05, + "loss": 0.1652, + "step": 5709 + }, + { + "epoch": 0.09136, + "grad_norm": 0.9453125, + "learning_rate": 9.159838709677419e-05, + "loss": 0.2137, + "step": 5710 + }, + { + "epoch": 0.091376, + "grad_norm": 0.6015625, + "learning_rate": 9.159677419354839e-05, + "loss": 0.15, + "step": 5711 + }, + { + "epoch": 0.091392, + "grad_norm": 0.796875, + "learning_rate": 9.159516129032259e-05, + "loss": 0.1768, + "step": 5712 + }, + { + "epoch": 0.091408, + "grad_norm": 0.78515625, + "learning_rate": 9.159354838709679e-05, + "loss": 0.1604, + "step": 5713 + }, + { + "epoch": 0.091424, + "grad_norm": 0.8359375, + "learning_rate": 9.159193548387097e-05, + "loss": 0.1436, + "step": 5714 + }, + { + "epoch": 0.09144, + "grad_norm": 0.65625, + "learning_rate": 9.159032258064516e-05, + "loss": 0.1735, + "step": 5715 + }, + { + "epoch": 0.091456, + "grad_norm": 1.078125, + "learning_rate": 9.158870967741936e-05, + "loss": 0.1726, + "step": 5716 + }, + { + "epoch": 0.091472, + "grad_norm": 1.03125, + "learning_rate": 9.158709677419355e-05, + "loss": 0.1998, + "step": 5717 + }, + { + "epoch": 0.091488, + "grad_norm": 1.0703125, + "learning_rate": 9.158548387096775e-05, + "loss": 0.1839, + "step": 5718 + }, + { + "epoch": 0.091504, + "grad_norm": 0.71484375, + "learning_rate": 9.158387096774193e-05, + "loss": 0.1737, + "step": 5719 + }, + { + "epoch": 0.09152, + "grad_norm": 1.2734375, + "learning_rate": 9.158225806451613e-05, + "loss": 0.1844, + "step": 5720 + }, + { + "epoch": 0.091536, + "grad_norm": 0.578125, + "learning_rate": 9.158064516129032e-05, + "loss": 0.1425, + "step": 5721 + }, + { + "epoch": 0.091552, + "grad_norm": 0.6796875, + "learning_rate": 9.157903225806452e-05, + "loss": 0.1473, + "step": 5722 + }, + { + "epoch": 0.091568, + "grad_norm": 0.8671875, + "learning_rate": 9.157741935483872e-05, + "loss": 0.1935, + "step": 5723 + }, + { + "epoch": 0.091584, + "grad_norm": 0.890625, + "learning_rate": 9.157580645161292e-05, + "loss": 0.1546, + "step": 5724 + }, + { + "epoch": 0.0916, + "grad_norm": 0.88671875, + "learning_rate": 9.15741935483871e-05, + "loss": 0.1701, + "step": 5725 + }, + { + "epoch": 0.091616, + "grad_norm": 0.84765625, + "learning_rate": 9.15725806451613e-05, + "loss": 0.1685, + "step": 5726 + }, + { + "epoch": 0.091632, + "grad_norm": 0.59765625, + "learning_rate": 9.157096774193549e-05, + "loss": 0.1271, + "step": 5727 + }, + { + "epoch": 0.091648, + "grad_norm": 0.703125, + "learning_rate": 9.156935483870969e-05, + "loss": 0.1506, + "step": 5728 + }, + { + "epoch": 0.091664, + "grad_norm": 0.92578125, + "learning_rate": 9.156774193548387e-05, + "loss": 0.1783, + "step": 5729 + }, + { + "epoch": 0.09168, + "grad_norm": 0.875, + "learning_rate": 9.156612903225806e-05, + "loss": 0.1977, + "step": 5730 + }, + { + "epoch": 0.091696, + "grad_norm": 0.9296875, + "learning_rate": 9.156451612903226e-05, + "loss": 0.1775, + "step": 5731 + }, + { + "epoch": 0.091712, + "grad_norm": 0.69921875, + "learning_rate": 9.156290322580645e-05, + "loss": 0.1947, + "step": 5732 + }, + { + "epoch": 0.091728, + "grad_norm": 0.97265625, + "learning_rate": 9.156129032258065e-05, + "loss": 0.1758, + "step": 5733 + }, + { + "epoch": 0.091744, + "grad_norm": 0.921875, + "learning_rate": 9.155967741935485e-05, + "loss": 0.1566, + "step": 5734 + }, + { + "epoch": 0.09176, + "grad_norm": 0.703125, + "learning_rate": 9.155806451612905e-05, + "loss": 0.1346, + "step": 5735 + }, + { + "epoch": 0.091776, + "grad_norm": 1.09375, + "learning_rate": 9.155645161290323e-05, + "loss": 0.206, + "step": 5736 + }, + { + "epoch": 0.091792, + "grad_norm": 0.921875, + "learning_rate": 9.155483870967743e-05, + "loss": 0.1412, + "step": 5737 + }, + { + "epoch": 0.091808, + "grad_norm": 0.9453125, + "learning_rate": 9.155322580645162e-05, + "loss": 0.1305, + "step": 5738 + }, + { + "epoch": 0.091824, + "grad_norm": 0.89453125, + "learning_rate": 9.155161290322582e-05, + "loss": 0.1422, + "step": 5739 + }, + { + "epoch": 0.09184, + "grad_norm": 0.58203125, + "learning_rate": 9.155e-05, + "loss": 0.1442, + "step": 5740 + }, + { + "epoch": 0.091856, + "grad_norm": 0.7421875, + "learning_rate": 9.15483870967742e-05, + "loss": 0.2071, + "step": 5741 + }, + { + "epoch": 0.091872, + "grad_norm": 1.4609375, + "learning_rate": 9.154677419354839e-05, + "loss": 0.2004, + "step": 5742 + }, + { + "epoch": 0.091888, + "grad_norm": 0.74609375, + "learning_rate": 9.154516129032259e-05, + "loss": 0.1553, + "step": 5743 + }, + { + "epoch": 0.091904, + "grad_norm": 0.5234375, + "learning_rate": 9.154354838709677e-05, + "loss": 0.1553, + "step": 5744 + }, + { + "epoch": 0.09192, + "grad_norm": 0.765625, + "learning_rate": 9.154193548387097e-05, + "loss": 0.1753, + "step": 5745 + }, + { + "epoch": 0.091936, + "grad_norm": 0.96875, + "learning_rate": 9.154032258064516e-05, + "loss": 0.1735, + "step": 5746 + }, + { + "epoch": 0.091952, + "grad_norm": 0.953125, + "learning_rate": 9.153870967741936e-05, + "loss": 0.1672, + "step": 5747 + }, + { + "epoch": 0.091968, + "grad_norm": 0.84765625, + "learning_rate": 9.153709677419356e-05, + "loss": 0.2162, + "step": 5748 + }, + { + "epoch": 0.091984, + "grad_norm": 0.8203125, + "learning_rate": 9.153548387096775e-05, + "loss": 0.1803, + "step": 5749 + }, + { + "epoch": 0.092, + "grad_norm": 0.71875, + "learning_rate": 9.153387096774194e-05, + "loss": 0.1984, + "step": 5750 + }, + { + "epoch": 0.092016, + "grad_norm": 0.7109375, + "learning_rate": 9.153225806451613e-05, + "loss": 0.1311, + "step": 5751 + }, + { + "epoch": 0.092032, + "grad_norm": 1.0625, + "learning_rate": 9.153064516129033e-05, + "loss": 0.196, + "step": 5752 + }, + { + "epoch": 0.092048, + "grad_norm": 0.828125, + "learning_rate": 9.152903225806452e-05, + "loss": 0.154, + "step": 5753 + }, + { + "epoch": 0.092064, + "grad_norm": 0.56640625, + "learning_rate": 9.152741935483872e-05, + "loss": 0.1583, + "step": 5754 + }, + { + "epoch": 0.09208, + "grad_norm": 0.97265625, + "learning_rate": 9.15258064516129e-05, + "loss": 0.1939, + "step": 5755 + }, + { + "epoch": 0.092096, + "grad_norm": 0.83203125, + "learning_rate": 9.15241935483871e-05, + "loss": 0.1692, + "step": 5756 + }, + { + "epoch": 0.092112, + "grad_norm": 0.9375, + "learning_rate": 9.152258064516129e-05, + "loss": 0.1773, + "step": 5757 + }, + { + "epoch": 0.092128, + "grad_norm": 0.78125, + "learning_rate": 9.152096774193549e-05, + "loss": 0.1225, + "step": 5758 + }, + { + "epoch": 0.092144, + "grad_norm": 1.2578125, + "learning_rate": 9.151935483870969e-05, + "loss": 0.1958, + "step": 5759 + }, + { + "epoch": 0.09216, + "grad_norm": 0.96484375, + "learning_rate": 9.151774193548389e-05, + "loss": 0.1951, + "step": 5760 + }, + { + "epoch": 0.092176, + "grad_norm": 1.1015625, + "learning_rate": 9.151612903225807e-05, + "loss": 0.1883, + "step": 5761 + }, + { + "epoch": 0.092192, + "grad_norm": 0.8359375, + "learning_rate": 9.151451612903226e-05, + "loss": 0.2053, + "step": 5762 + }, + { + "epoch": 0.092208, + "grad_norm": 0.86328125, + "learning_rate": 9.151290322580646e-05, + "loss": 0.2196, + "step": 5763 + }, + { + "epoch": 0.092224, + "grad_norm": 0.7734375, + "learning_rate": 9.151129032258064e-05, + "loss": 0.1895, + "step": 5764 + }, + { + "epoch": 0.09224, + "grad_norm": 0.8828125, + "learning_rate": 9.150967741935484e-05, + "loss": 0.1779, + "step": 5765 + }, + { + "epoch": 0.092256, + "grad_norm": 0.79296875, + "learning_rate": 9.150806451612903e-05, + "loss": 0.1775, + "step": 5766 + }, + { + "epoch": 0.092272, + "grad_norm": 0.55078125, + "learning_rate": 9.150645161290323e-05, + "loss": 0.1665, + "step": 5767 + }, + { + "epoch": 0.092288, + "grad_norm": 0.91796875, + "learning_rate": 9.150483870967742e-05, + "loss": 0.1697, + "step": 5768 + }, + { + "epoch": 0.092304, + "grad_norm": 0.88671875, + "learning_rate": 9.150322580645162e-05, + "loss": 0.1683, + "step": 5769 + }, + { + "epoch": 0.09232, + "grad_norm": 0.74609375, + "learning_rate": 9.15016129032258e-05, + "loss": 0.163, + "step": 5770 + }, + { + "epoch": 0.092336, + "grad_norm": 0.984375, + "learning_rate": 9.15e-05, + "loss": 0.1901, + "step": 5771 + }, + { + "epoch": 0.092352, + "grad_norm": 0.76953125, + "learning_rate": 9.14983870967742e-05, + "loss": 0.1754, + "step": 5772 + }, + { + "epoch": 0.092368, + "grad_norm": 1.109375, + "learning_rate": 9.14967741935484e-05, + "loss": 0.1404, + "step": 5773 + }, + { + "epoch": 0.092384, + "grad_norm": 0.57421875, + "learning_rate": 9.149516129032259e-05, + "loss": 0.1787, + "step": 5774 + }, + { + "epoch": 0.0924, + "grad_norm": 0.85546875, + "learning_rate": 9.149354838709679e-05, + "loss": 0.1834, + "step": 5775 + }, + { + "epoch": 0.092416, + "grad_norm": 1.1953125, + "learning_rate": 9.149193548387097e-05, + "loss": 0.1689, + "step": 5776 + }, + { + "epoch": 0.092432, + "grad_norm": 0.78125, + "learning_rate": 9.149032258064516e-05, + "loss": 0.1832, + "step": 5777 + }, + { + "epoch": 0.092448, + "grad_norm": 1.1171875, + "learning_rate": 9.148870967741936e-05, + "loss": 0.2085, + "step": 5778 + }, + { + "epoch": 0.092464, + "grad_norm": 0.7734375, + "learning_rate": 9.148709677419354e-05, + "loss": 0.1702, + "step": 5779 + }, + { + "epoch": 0.09248, + "grad_norm": 0.9140625, + "learning_rate": 9.148548387096774e-05, + "loss": 0.1924, + "step": 5780 + }, + { + "epoch": 0.092496, + "grad_norm": 0.88671875, + "learning_rate": 9.148387096774193e-05, + "loss": 0.1572, + "step": 5781 + }, + { + "epoch": 0.092512, + "grad_norm": 0.87890625, + "learning_rate": 9.148225806451613e-05, + "loss": 0.1598, + "step": 5782 + }, + { + "epoch": 0.092528, + "grad_norm": 1.078125, + "learning_rate": 9.148064516129033e-05, + "loss": 0.2084, + "step": 5783 + }, + { + "epoch": 0.092544, + "grad_norm": 1.1171875, + "learning_rate": 9.147903225806453e-05, + "loss": 0.1887, + "step": 5784 + }, + { + "epoch": 0.09256, + "grad_norm": 0.60546875, + "learning_rate": 9.147741935483871e-05, + "loss": 0.1904, + "step": 5785 + }, + { + "epoch": 0.092576, + "grad_norm": 0.78125, + "learning_rate": 9.147580645161291e-05, + "loss": 0.1895, + "step": 5786 + }, + { + "epoch": 0.092592, + "grad_norm": 0.91015625, + "learning_rate": 9.14741935483871e-05, + "loss": 0.1906, + "step": 5787 + }, + { + "epoch": 0.092608, + "grad_norm": 0.88671875, + "learning_rate": 9.14725806451613e-05, + "loss": 0.192, + "step": 5788 + }, + { + "epoch": 0.092624, + "grad_norm": 1.0, + "learning_rate": 9.147096774193549e-05, + "loss": 0.2351, + "step": 5789 + }, + { + "epoch": 0.09264, + "grad_norm": 0.84375, + "learning_rate": 9.146935483870969e-05, + "loss": 0.1616, + "step": 5790 + }, + { + "epoch": 0.092656, + "grad_norm": 0.6796875, + "learning_rate": 9.146774193548387e-05, + "loss": 0.1896, + "step": 5791 + }, + { + "epoch": 0.092672, + "grad_norm": 0.94921875, + "learning_rate": 9.146612903225807e-05, + "loss": 0.1989, + "step": 5792 + }, + { + "epoch": 0.092688, + "grad_norm": 0.89453125, + "learning_rate": 9.146451612903226e-05, + "loss": 0.1767, + "step": 5793 + }, + { + "epoch": 0.092704, + "grad_norm": 0.70703125, + "learning_rate": 9.146290322580646e-05, + "loss": 0.1498, + "step": 5794 + }, + { + "epoch": 0.09272, + "grad_norm": 0.83203125, + "learning_rate": 9.146129032258066e-05, + "loss": 0.1912, + "step": 5795 + }, + { + "epoch": 0.092736, + "grad_norm": 1.0625, + "learning_rate": 9.145967741935484e-05, + "loss": 0.1939, + "step": 5796 + }, + { + "epoch": 0.092752, + "grad_norm": 0.56640625, + "learning_rate": 9.145806451612904e-05, + "loss": 0.1506, + "step": 5797 + }, + { + "epoch": 0.092768, + "grad_norm": 0.7734375, + "learning_rate": 9.145645161290323e-05, + "loss": 0.1608, + "step": 5798 + }, + { + "epoch": 0.092784, + "grad_norm": 0.76171875, + "learning_rate": 9.145483870967743e-05, + "loss": 0.1524, + "step": 5799 + }, + { + "epoch": 0.0928, + "grad_norm": 0.765625, + "learning_rate": 9.145322580645161e-05, + "loss": 0.2551, + "step": 5800 + }, + { + "epoch": 0.092816, + "grad_norm": 1.2421875, + "learning_rate": 9.145161290322581e-05, + "loss": 0.2027, + "step": 5801 + }, + { + "epoch": 0.092832, + "grad_norm": 1.4609375, + "learning_rate": 9.145e-05, + "loss": 0.1792, + "step": 5802 + }, + { + "epoch": 0.092848, + "grad_norm": 0.55859375, + "learning_rate": 9.14483870967742e-05, + "loss": 0.1511, + "step": 5803 + }, + { + "epoch": 0.092864, + "grad_norm": 0.6875, + "learning_rate": 9.144677419354839e-05, + "loss": 0.1474, + "step": 5804 + }, + { + "epoch": 0.09288, + "grad_norm": 0.95703125, + "learning_rate": 9.144516129032259e-05, + "loss": 0.1943, + "step": 5805 + }, + { + "epoch": 0.092896, + "grad_norm": 0.75, + "learning_rate": 9.144354838709677e-05, + "loss": 0.1792, + "step": 5806 + }, + { + "epoch": 0.092912, + "grad_norm": 0.77734375, + "learning_rate": 9.144193548387097e-05, + "loss": 0.166, + "step": 5807 + }, + { + "epoch": 0.092928, + "grad_norm": 0.47265625, + "learning_rate": 9.144032258064517e-05, + "loss": 0.1475, + "step": 5808 + }, + { + "epoch": 0.092944, + "grad_norm": 0.68359375, + "learning_rate": 9.143870967741936e-05, + "loss": 0.195, + "step": 5809 + }, + { + "epoch": 0.09296, + "grad_norm": 0.734375, + "learning_rate": 9.143709677419356e-05, + "loss": 0.1424, + "step": 5810 + }, + { + "epoch": 0.092976, + "grad_norm": 0.7265625, + "learning_rate": 9.143548387096774e-05, + "loss": 0.1964, + "step": 5811 + }, + { + "epoch": 0.092992, + "grad_norm": 1.296875, + "learning_rate": 9.143387096774194e-05, + "loss": 0.2049, + "step": 5812 + }, + { + "epoch": 0.093008, + "grad_norm": 0.6328125, + "learning_rate": 9.143225806451613e-05, + "loss": 0.1622, + "step": 5813 + }, + { + "epoch": 0.093024, + "grad_norm": 1.0625, + "learning_rate": 9.143064516129033e-05, + "loss": 0.1983, + "step": 5814 + }, + { + "epoch": 0.09304, + "grad_norm": 0.76953125, + "learning_rate": 9.142903225806451e-05, + "loss": 0.1932, + "step": 5815 + }, + { + "epoch": 0.093056, + "grad_norm": 1.046875, + "learning_rate": 9.142741935483871e-05, + "loss": 0.1938, + "step": 5816 + }, + { + "epoch": 0.093072, + "grad_norm": 0.63671875, + "learning_rate": 9.14258064516129e-05, + "loss": 0.1825, + "step": 5817 + }, + { + "epoch": 0.093088, + "grad_norm": 0.6328125, + "learning_rate": 9.14241935483871e-05, + "loss": 0.2051, + "step": 5818 + }, + { + "epoch": 0.093104, + "grad_norm": 0.9609375, + "learning_rate": 9.14225806451613e-05, + "loss": 0.2102, + "step": 5819 + }, + { + "epoch": 0.09312, + "grad_norm": 0.83984375, + "learning_rate": 9.14209677419355e-05, + "loss": 0.1582, + "step": 5820 + }, + { + "epoch": 0.093136, + "grad_norm": 1.265625, + "learning_rate": 9.141935483870968e-05, + "loss": 0.187, + "step": 5821 + }, + { + "epoch": 0.093152, + "grad_norm": 0.625, + "learning_rate": 9.141774193548388e-05, + "loss": 0.1689, + "step": 5822 + }, + { + "epoch": 0.093168, + "grad_norm": 0.71484375, + "learning_rate": 9.141612903225807e-05, + "loss": 0.1232, + "step": 5823 + }, + { + "epoch": 0.093184, + "grad_norm": 1.0546875, + "learning_rate": 9.141451612903226e-05, + "loss": 0.2064, + "step": 5824 + }, + { + "epoch": 0.0932, + "grad_norm": 0.609375, + "learning_rate": 9.141290322580646e-05, + "loss": 0.166, + "step": 5825 + }, + { + "epoch": 0.093216, + "grad_norm": 0.63671875, + "learning_rate": 9.141129032258064e-05, + "loss": 0.1734, + "step": 5826 + }, + { + "epoch": 0.093232, + "grad_norm": 0.671875, + "learning_rate": 9.140967741935484e-05, + "loss": 0.1753, + "step": 5827 + }, + { + "epoch": 0.093248, + "grad_norm": 0.94140625, + "learning_rate": 9.140806451612903e-05, + "loss": 0.1705, + "step": 5828 + }, + { + "epoch": 0.093264, + "grad_norm": 1.359375, + "learning_rate": 9.140645161290323e-05, + "loss": 0.1985, + "step": 5829 + }, + { + "epoch": 0.09328, + "grad_norm": 0.62109375, + "learning_rate": 9.140483870967743e-05, + "loss": 0.1703, + "step": 5830 + }, + { + "epoch": 0.093296, + "grad_norm": 0.8671875, + "learning_rate": 9.140322580645161e-05, + "loss": 0.1549, + "step": 5831 + }, + { + "epoch": 0.093312, + "grad_norm": 0.94140625, + "learning_rate": 9.140161290322581e-05, + "loss": 0.1629, + "step": 5832 + }, + { + "epoch": 0.093328, + "grad_norm": 0.73828125, + "learning_rate": 9.140000000000001e-05, + "loss": 0.1946, + "step": 5833 + }, + { + "epoch": 0.093344, + "grad_norm": 1.4375, + "learning_rate": 9.13983870967742e-05, + "loss": 0.1786, + "step": 5834 + }, + { + "epoch": 0.09336, + "grad_norm": 1.3046875, + "learning_rate": 9.13967741935484e-05, + "loss": 0.1773, + "step": 5835 + }, + { + "epoch": 0.093376, + "grad_norm": 0.6953125, + "learning_rate": 9.139516129032258e-05, + "loss": 0.1875, + "step": 5836 + }, + { + "epoch": 0.093392, + "grad_norm": 1.28125, + "learning_rate": 9.139354838709678e-05, + "loss": 0.1802, + "step": 5837 + }, + { + "epoch": 0.093408, + "grad_norm": 0.77734375, + "learning_rate": 9.139193548387097e-05, + "loss": 0.1592, + "step": 5838 + }, + { + "epoch": 0.093424, + "grad_norm": 0.69140625, + "learning_rate": 9.139032258064516e-05, + "loss": 0.166, + "step": 5839 + }, + { + "epoch": 0.09344, + "grad_norm": 1.0078125, + "learning_rate": 9.138870967741936e-05, + "loss": 0.1729, + "step": 5840 + }, + { + "epoch": 0.093456, + "grad_norm": 0.9140625, + "learning_rate": 9.138709677419354e-05, + "loss": 0.1734, + "step": 5841 + }, + { + "epoch": 0.093472, + "grad_norm": 1.1328125, + "learning_rate": 9.138548387096774e-05, + "loss": 0.2272, + "step": 5842 + }, + { + "epoch": 0.093488, + "grad_norm": 0.6484375, + "learning_rate": 9.138387096774194e-05, + "loss": 0.1796, + "step": 5843 + }, + { + "epoch": 0.093504, + "grad_norm": 0.8203125, + "learning_rate": 9.138225806451614e-05, + "loss": 0.1701, + "step": 5844 + }, + { + "epoch": 0.09352, + "grad_norm": 0.78515625, + "learning_rate": 9.138064516129033e-05, + "loss": 0.1901, + "step": 5845 + }, + { + "epoch": 0.093536, + "grad_norm": 0.77734375, + "learning_rate": 9.137903225806453e-05, + "loss": 0.1932, + "step": 5846 + }, + { + "epoch": 0.093552, + "grad_norm": 0.97265625, + "learning_rate": 9.137741935483871e-05, + "loss": 0.1905, + "step": 5847 + }, + { + "epoch": 0.093568, + "grad_norm": 0.765625, + "learning_rate": 9.137580645161291e-05, + "loss": 0.172, + "step": 5848 + }, + { + "epoch": 0.093584, + "grad_norm": 0.78515625, + "learning_rate": 9.13741935483871e-05, + "loss": 0.1632, + "step": 5849 + }, + { + "epoch": 0.0936, + "grad_norm": 0.75, + "learning_rate": 9.13725806451613e-05, + "loss": 0.1636, + "step": 5850 + }, + { + "epoch": 0.093616, + "grad_norm": 0.65625, + "learning_rate": 9.137096774193548e-05, + "loss": 0.157, + "step": 5851 + }, + { + "epoch": 0.093632, + "grad_norm": 0.6640625, + "learning_rate": 9.136935483870968e-05, + "loss": 0.1808, + "step": 5852 + }, + { + "epoch": 0.093648, + "grad_norm": 0.7578125, + "learning_rate": 9.136774193548387e-05, + "loss": 0.137, + "step": 5853 + }, + { + "epoch": 0.093664, + "grad_norm": 0.88671875, + "learning_rate": 9.136612903225807e-05, + "loss": 0.2035, + "step": 5854 + }, + { + "epoch": 0.09368, + "grad_norm": 0.52734375, + "learning_rate": 9.136451612903227e-05, + "loss": 0.1339, + "step": 5855 + }, + { + "epoch": 0.093696, + "grad_norm": 0.5859375, + "learning_rate": 9.136290322580645e-05, + "loss": 0.1525, + "step": 5856 + }, + { + "epoch": 0.093712, + "grad_norm": 0.80859375, + "learning_rate": 9.136129032258065e-05, + "loss": 0.151, + "step": 5857 + }, + { + "epoch": 0.093728, + "grad_norm": 0.921875, + "learning_rate": 9.135967741935484e-05, + "loss": 0.1938, + "step": 5858 + }, + { + "epoch": 0.093744, + "grad_norm": 0.80078125, + "learning_rate": 9.135806451612904e-05, + "loss": 0.1756, + "step": 5859 + }, + { + "epoch": 0.09376, + "grad_norm": 1.0078125, + "learning_rate": 9.135645161290323e-05, + "loss": 0.2047, + "step": 5860 + }, + { + "epoch": 0.093776, + "grad_norm": 0.57421875, + "learning_rate": 9.135483870967743e-05, + "loss": 0.17, + "step": 5861 + }, + { + "epoch": 0.093792, + "grad_norm": 0.74609375, + "learning_rate": 9.135322580645161e-05, + "loss": 0.2126, + "step": 5862 + }, + { + "epoch": 0.093808, + "grad_norm": 0.80859375, + "learning_rate": 9.135161290322581e-05, + "loss": 0.1792, + "step": 5863 + }, + { + "epoch": 0.093824, + "grad_norm": 0.78125, + "learning_rate": 9.135e-05, + "loss": 0.2191, + "step": 5864 + }, + { + "epoch": 0.09384, + "grad_norm": 1.0625, + "learning_rate": 9.13483870967742e-05, + "loss": 0.2126, + "step": 5865 + }, + { + "epoch": 0.093856, + "grad_norm": 1.1171875, + "learning_rate": 9.134677419354838e-05, + "loss": 0.1626, + "step": 5866 + }, + { + "epoch": 0.093872, + "grad_norm": 1.7109375, + "learning_rate": 9.134516129032258e-05, + "loss": 0.1956, + "step": 5867 + }, + { + "epoch": 0.093888, + "grad_norm": 0.859375, + "learning_rate": 9.134354838709678e-05, + "loss": 0.1798, + "step": 5868 + }, + { + "epoch": 0.093904, + "grad_norm": 0.578125, + "learning_rate": 9.134193548387098e-05, + "loss": 0.1732, + "step": 5869 + }, + { + "epoch": 0.09392, + "grad_norm": 0.8203125, + "learning_rate": 9.134032258064517e-05, + "loss": 0.1797, + "step": 5870 + }, + { + "epoch": 0.093936, + "grad_norm": 1.453125, + "learning_rate": 9.133870967741935e-05, + "loss": 0.1924, + "step": 5871 + }, + { + "epoch": 0.093952, + "grad_norm": 0.984375, + "learning_rate": 9.133709677419355e-05, + "loss": 0.2293, + "step": 5872 + }, + { + "epoch": 0.093968, + "grad_norm": 1.1328125, + "learning_rate": 9.133548387096774e-05, + "loss": 0.1805, + "step": 5873 + }, + { + "epoch": 0.093984, + "grad_norm": 0.87109375, + "learning_rate": 9.133387096774194e-05, + "loss": 0.1551, + "step": 5874 + }, + { + "epoch": 0.094, + "grad_norm": 0.7109375, + "learning_rate": 9.133225806451613e-05, + "loss": 0.1952, + "step": 5875 + }, + { + "epoch": 0.094016, + "grad_norm": 0.62109375, + "learning_rate": 9.133064516129033e-05, + "loss": 0.1714, + "step": 5876 + }, + { + "epoch": 0.094032, + "grad_norm": 0.87109375, + "learning_rate": 9.132903225806451e-05, + "loss": 0.1911, + "step": 5877 + }, + { + "epoch": 0.094048, + "grad_norm": 0.62890625, + "learning_rate": 9.132741935483871e-05, + "loss": 0.1773, + "step": 5878 + }, + { + "epoch": 0.094064, + "grad_norm": 0.88671875, + "learning_rate": 9.132580645161291e-05, + "loss": 0.2192, + "step": 5879 + }, + { + "epoch": 0.09408, + "grad_norm": 1.21875, + "learning_rate": 9.132419354838711e-05, + "loss": 0.1683, + "step": 5880 + }, + { + "epoch": 0.094096, + "grad_norm": 0.76171875, + "learning_rate": 9.13225806451613e-05, + "loss": 0.1776, + "step": 5881 + }, + { + "epoch": 0.094112, + "grad_norm": 0.78515625, + "learning_rate": 9.13209677419355e-05, + "loss": 0.1878, + "step": 5882 + }, + { + "epoch": 0.094128, + "grad_norm": 0.88671875, + "learning_rate": 9.131935483870968e-05, + "loss": 0.1975, + "step": 5883 + }, + { + "epoch": 0.094144, + "grad_norm": 0.90234375, + "learning_rate": 9.131774193548388e-05, + "loss": 0.1769, + "step": 5884 + }, + { + "epoch": 0.09416, + "grad_norm": 0.61328125, + "learning_rate": 9.131612903225807e-05, + "loss": 0.1798, + "step": 5885 + }, + { + "epoch": 0.094176, + "grad_norm": 0.6171875, + "learning_rate": 9.131451612903225e-05, + "loss": 0.1307, + "step": 5886 + }, + { + "epoch": 0.094192, + "grad_norm": 0.796875, + "learning_rate": 9.131290322580645e-05, + "loss": 0.2111, + "step": 5887 + }, + { + "epoch": 0.094208, + "grad_norm": 1.0390625, + "learning_rate": 9.131129032258064e-05, + "loss": 0.1932, + "step": 5888 + }, + { + "epoch": 0.094224, + "grad_norm": 0.703125, + "learning_rate": 9.130967741935484e-05, + "loss": 0.1941, + "step": 5889 + }, + { + "epoch": 0.09424, + "grad_norm": 0.98046875, + "learning_rate": 9.130806451612904e-05, + "loss": 0.2201, + "step": 5890 + }, + { + "epoch": 0.094256, + "grad_norm": 0.828125, + "learning_rate": 9.130645161290324e-05, + "loss": 0.1447, + "step": 5891 + }, + { + "epoch": 0.094272, + "grad_norm": 0.86328125, + "learning_rate": 9.130483870967742e-05, + "loss": 0.2032, + "step": 5892 + }, + { + "epoch": 0.094288, + "grad_norm": 1.390625, + "learning_rate": 9.130322580645162e-05, + "loss": 0.2179, + "step": 5893 + }, + { + "epoch": 0.094304, + "grad_norm": 0.8515625, + "learning_rate": 9.130161290322581e-05, + "loss": 0.1786, + "step": 5894 + }, + { + "epoch": 0.09432, + "grad_norm": 0.6796875, + "learning_rate": 9.130000000000001e-05, + "loss": 0.1744, + "step": 5895 + }, + { + "epoch": 0.094336, + "grad_norm": 0.65625, + "learning_rate": 9.12983870967742e-05, + "loss": 0.1663, + "step": 5896 + }, + { + "epoch": 0.094352, + "grad_norm": 1.40625, + "learning_rate": 9.12967741935484e-05, + "loss": 0.1432, + "step": 5897 + }, + { + "epoch": 0.094368, + "grad_norm": 0.97265625, + "learning_rate": 9.129516129032258e-05, + "loss": 0.1337, + "step": 5898 + }, + { + "epoch": 0.094384, + "grad_norm": 0.796875, + "learning_rate": 9.129354838709678e-05, + "loss": 0.158, + "step": 5899 + }, + { + "epoch": 0.0944, + "grad_norm": 0.69921875, + "learning_rate": 9.129193548387097e-05, + "loss": 0.1771, + "step": 5900 + }, + { + "epoch": 0.094416, + "grad_norm": 0.69140625, + "learning_rate": 9.129032258064517e-05, + "loss": 0.1943, + "step": 5901 + }, + { + "epoch": 0.094432, + "grad_norm": 0.68359375, + "learning_rate": 9.128870967741935e-05, + "loss": 0.1743, + "step": 5902 + }, + { + "epoch": 0.094448, + "grad_norm": 0.69140625, + "learning_rate": 9.128709677419355e-05, + "loss": 0.2073, + "step": 5903 + }, + { + "epoch": 0.094464, + "grad_norm": 0.73828125, + "learning_rate": 9.128548387096775e-05, + "loss": 0.1819, + "step": 5904 + }, + { + "epoch": 0.09448, + "grad_norm": 0.97265625, + "learning_rate": 9.128387096774194e-05, + "loss": 0.2371, + "step": 5905 + }, + { + "epoch": 0.094496, + "grad_norm": 0.98828125, + "learning_rate": 9.128225806451614e-05, + "loss": 0.2188, + "step": 5906 + }, + { + "epoch": 0.094512, + "grad_norm": 1.2578125, + "learning_rate": 9.128064516129032e-05, + "loss": 0.1668, + "step": 5907 + }, + { + "epoch": 0.094528, + "grad_norm": 1.1640625, + "learning_rate": 9.127903225806452e-05, + "loss": 0.2214, + "step": 5908 + }, + { + "epoch": 0.094544, + "grad_norm": 0.51171875, + "learning_rate": 9.127741935483871e-05, + "loss": 0.1372, + "step": 5909 + }, + { + "epoch": 0.09456, + "grad_norm": 1.2890625, + "learning_rate": 9.127580645161291e-05, + "loss": 0.1736, + "step": 5910 + }, + { + "epoch": 0.094576, + "grad_norm": 0.953125, + "learning_rate": 9.12741935483871e-05, + "loss": 0.1607, + "step": 5911 + }, + { + "epoch": 0.094592, + "grad_norm": 0.6640625, + "learning_rate": 9.12725806451613e-05, + "loss": 0.1699, + "step": 5912 + }, + { + "epoch": 0.094608, + "grad_norm": 0.84375, + "learning_rate": 9.127096774193548e-05, + "loss": 0.171, + "step": 5913 + }, + { + "epoch": 0.094624, + "grad_norm": 1.0078125, + "learning_rate": 9.126935483870968e-05, + "loss": 0.1725, + "step": 5914 + }, + { + "epoch": 0.09464, + "grad_norm": 0.86328125, + "learning_rate": 9.126774193548388e-05, + "loss": 0.1719, + "step": 5915 + }, + { + "epoch": 0.094656, + "grad_norm": 0.8515625, + "learning_rate": 9.126612903225808e-05, + "loss": 0.2169, + "step": 5916 + }, + { + "epoch": 0.094672, + "grad_norm": 0.98046875, + "learning_rate": 9.126451612903227e-05, + "loss": 0.1609, + "step": 5917 + }, + { + "epoch": 0.094688, + "grad_norm": 0.8984375, + "learning_rate": 9.126290322580645e-05, + "loss": 0.2024, + "step": 5918 + }, + { + "epoch": 0.094704, + "grad_norm": 0.70703125, + "learning_rate": 9.126129032258065e-05, + "loss": 0.1458, + "step": 5919 + }, + { + "epoch": 0.09472, + "grad_norm": 0.859375, + "learning_rate": 9.125967741935484e-05, + "loss": 0.1611, + "step": 5920 + }, + { + "epoch": 0.094736, + "grad_norm": 0.609375, + "learning_rate": 9.125806451612904e-05, + "loss": 0.1306, + "step": 5921 + }, + { + "epoch": 0.094752, + "grad_norm": 0.8984375, + "learning_rate": 9.125645161290322e-05, + "loss": 0.1588, + "step": 5922 + }, + { + "epoch": 0.094768, + "grad_norm": 0.62109375, + "learning_rate": 9.125483870967742e-05, + "loss": 0.2075, + "step": 5923 + }, + { + "epoch": 0.094784, + "grad_norm": 0.953125, + "learning_rate": 9.125322580645161e-05, + "loss": 0.1838, + "step": 5924 + }, + { + "epoch": 0.0948, + "grad_norm": 0.81640625, + "learning_rate": 9.125161290322581e-05, + "loss": 0.1789, + "step": 5925 + }, + { + "epoch": 0.094816, + "grad_norm": 1.9296875, + "learning_rate": 9.125e-05, + "loss": 0.1564, + "step": 5926 + }, + { + "epoch": 0.094832, + "grad_norm": 1.1484375, + "learning_rate": 9.12483870967742e-05, + "loss": 0.1842, + "step": 5927 + }, + { + "epoch": 0.094848, + "grad_norm": 1.078125, + "learning_rate": 9.12467741935484e-05, + "loss": 0.1695, + "step": 5928 + }, + { + "epoch": 0.094864, + "grad_norm": 1.421875, + "learning_rate": 9.12451612903226e-05, + "loss": 0.1796, + "step": 5929 + }, + { + "epoch": 0.09488, + "grad_norm": 1.3125, + "learning_rate": 9.124354838709678e-05, + "loss": 0.1781, + "step": 5930 + }, + { + "epoch": 0.094896, + "grad_norm": 0.9921875, + "learning_rate": 9.124193548387098e-05, + "loss": 0.1913, + "step": 5931 + }, + { + "epoch": 0.094912, + "grad_norm": 0.69921875, + "learning_rate": 9.124032258064517e-05, + "loss": 0.1324, + "step": 5932 + }, + { + "epoch": 0.094928, + "grad_norm": 1.234375, + "learning_rate": 9.123870967741935e-05, + "loss": 0.1607, + "step": 5933 + }, + { + "epoch": 0.094944, + "grad_norm": 1.03125, + "learning_rate": 9.123709677419355e-05, + "loss": 0.2044, + "step": 5934 + }, + { + "epoch": 0.09496, + "grad_norm": 0.61328125, + "learning_rate": 9.123548387096774e-05, + "loss": 0.1731, + "step": 5935 + }, + { + "epoch": 0.094976, + "grad_norm": 0.93359375, + "learning_rate": 9.123387096774194e-05, + "loss": 0.1831, + "step": 5936 + }, + { + "epoch": 0.094992, + "grad_norm": 0.83984375, + "learning_rate": 9.123225806451612e-05, + "loss": 0.1614, + "step": 5937 + }, + { + "epoch": 0.095008, + "grad_norm": 0.71484375, + "learning_rate": 9.123064516129032e-05, + "loss": 0.1792, + "step": 5938 + }, + { + "epoch": 0.095024, + "grad_norm": 0.87890625, + "learning_rate": 9.122903225806452e-05, + "loss": 0.1973, + "step": 5939 + }, + { + "epoch": 0.09504, + "grad_norm": 1.1796875, + "learning_rate": 9.122741935483872e-05, + "loss": 0.1576, + "step": 5940 + }, + { + "epoch": 0.095056, + "grad_norm": 1.03125, + "learning_rate": 9.122580645161291e-05, + "loss": 0.1701, + "step": 5941 + }, + { + "epoch": 0.095072, + "grad_norm": 0.84375, + "learning_rate": 9.122419354838711e-05, + "loss": 0.1383, + "step": 5942 + }, + { + "epoch": 0.095088, + "grad_norm": 0.6484375, + "learning_rate": 9.12225806451613e-05, + "loss": 0.1808, + "step": 5943 + }, + { + "epoch": 0.095104, + "grad_norm": 0.86328125, + "learning_rate": 9.12209677419355e-05, + "loss": 0.1895, + "step": 5944 + }, + { + "epoch": 0.09512, + "grad_norm": 0.875, + "learning_rate": 9.121935483870968e-05, + "loss": 0.1863, + "step": 5945 + }, + { + "epoch": 0.095136, + "grad_norm": 0.87890625, + "learning_rate": 9.121774193548388e-05, + "loss": 0.2083, + "step": 5946 + }, + { + "epoch": 0.095152, + "grad_norm": 0.765625, + "learning_rate": 9.121612903225807e-05, + "loss": 0.1909, + "step": 5947 + }, + { + "epoch": 0.095168, + "grad_norm": 0.59765625, + "learning_rate": 9.121451612903225e-05, + "loss": 0.1691, + "step": 5948 + }, + { + "epoch": 0.095184, + "grad_norm": 0.9375, + "learning_rate": 9.121290322580645e-05, + "loss": 0.1883, + "step": 5949 + }, + { + "epoch": 0.0952, + "grad_norm": 0.53125, + "learning_rate": 9.121129032258065e-05, + "loss": 0.1614, + "step": 5950 + }, + { + "epoch": 0.095216, + "grad_norm": 0.87890625, + "learning_rate": 9.120967741935485e-05, + "loss": 0.1968, + "step": 5951 + }, + { + "epoch": 0.095232, + "grad_norm": 0.74609375, + "learning_rate": 9.120806451612904e-05, + "loss": 0.1776, + "step": 5952 + }, + { + "epoch": 0.095248, + "grad_norm": 0.83984375, + "learning_rate": 9.120645161290324e-05, + "loss": 0.1824, + "step": 5953 + }, + { + "epoch": 0.095264, + "grad_norm": 0.9765625, + "learning_rate": 9.120483870967742e-05, + "loss": 0.2348, + "step": 5954 + }, + { + "epoch": 0.09528, + "grad_norm": 0.546875, + "learning_rate": 9.120322580645162e-05, + "loss": 0.1319, + "step": 5955 + }, + { + "epoch": 0.095296, + "grad_norm": 0.8984375, + "learning_rate": 9.120161290322581e-05, + "loss": 0.1926, + "step": 5956 + }, + { + "epoch": 0.095312, + "grad_norm": 0.765625, + "learning_rate": 9.120000000000001e-05, + "loss": 0.1808, + "step": 5957 + }, + { + "epoch": 0.095328, + "grad_norm": 0.6484375, + "learning_rate": 9.11983870967742e-05, + "loss": 0.1971, + "step": 5958 + }, + { + "epoch": 0.095344, + "grad_norm": 0.8203125, + "learning_rate": 9.11967741935484e-05, + "loss": 0.1895, + "step": 5959 + }, + { + "epoch": 0.09536, + "grad_norm": 0.60546875, + "learning_rate": 9.119516129032258e-05, + "loss": 0.1942, + "step": 5960 + }, + { + "epoch": 0.095376, + "grad_norm": 1.140625, + "learning_rate": 9.119354838709678e-05, + "loss": 0.2248, + "step": 5961 + }, + { + "epoch": 0.095392, + "grad_norm": 0.60546875, + "learning_rate": 9.119193548387097e-05, + "loss": 0.1644, + "step": 5962 + }, + { + "epoch": 0.095408, + "grad_norm": 0.984375, + "learning_rate": 9.119032258064516e-05, + "loss": 0.1922, + "step": 5963 + }, + { + "epoch": 0.095424, + "grad_norm": 1.3515625, + "learning_rate": 9.118870967741936e-05, + "loss": 0.2122, + "step": 5964 + }, + { + "epoch": 0.09544, + "grad_norm": 0.546875, + "learning_rate": 9.118709677419355e-05, + "loss": 0.1476, + "step": 5965 + }, + { + "epoch": 0.095456, + "grad_norm": 0.79296875, + "learning_rate": 9.118548387096775e-05, + "loss": 0.1991, + "step": 5966 + }, + { + "epoch": 0.095472, + "grad_norm": 0.54296875, + "learning_rate": 9.118387096774194e-05, + "loss": 0.139, + "step": 5967 + }, + { + "epoch": 0.095488, + "grad_norm": 1.5234375, + "learning_rate": 9.118225806451614e-05, + "loss": 0.1764, + "step": 5968 + }, + { + "epoch": 0.095504, + "grad_norm": 0.95703125, + "learning_rate": 9.118064516129032e-05, + "loss": 0.2027, + "step": 5969 + }, + { + "epoch": 0.09552, + "grad_norm": 0.78515625, + "learning_rate": 9.117903225806452e-05, + "loss": 0.1622, + "step": 5970 + }, + { + "epoch": 0.095536, + "grad_norm": 1.234375, + "learning_rate": 9.117741935483871e-05, + "loss": 0.1828, + "step": 5971 + }, + { + "epoch": 0.095552, + "grad_norm": 0.7890625, + "learning_rate": 9.117580645161291e-05, + "loss": 0.2173, + "step": 5972 + }, + { + "epoch": 0.095568, + "grad_norm": 0.95703125, + "learning_rate": 9.11741935483871e-05, + "loss": 0.1431, + "step": 5973 + }, + { + "epoch": 0.095584, + "grad_norm": 0.75, + "learning_rate": 9.117258064516129e-05, + "loss": 0.1713, + "step": 5974 + }, + { + "epoch": 0.0956, + "grad_norm": 0.96484375, + "learning_rate": 9.117096774193549e-05, + "loss": 0.1704, + "step": 5975 + }, + { + "epoch": 0.095616, + "grad_norm": 0.67578125, + "learning_rate": 9.116935483870969e-05, + "loss": 0.1544, + "step": 5976 + }, + { + "epoch": 0.095632, + "grad_norm": 1.2734375, + "learning_rate": 9.116774193548388e-05, + "loss": 0.2166, + "step": 5977 + }, + { + "epoch": 0.095648, + "grad_norm": 1.0546875, + "learning_rate": 9.116612903225808e-05, + "loss": 0.1913, + "step": 5978 + }, + { + "epoch": 0.095664, + "grad_norm": 0.88671875, + "learning_rate": 9.116451612903226e-05, + "loss": 0.1979, + "step": 5979 + }, + { + "epoch": 0.09568, + "grad_norm": 0.85546875, + "learning_rate": 9.116290322580645e-05, + "loss": 0.1918, + "step": 5980 + }, + { + "epoch": 0.095696, + "grad_norm": 0.82421875, + "learning_rate": 9.116129032258065e-05, + "loss": 0.143, + "step": 5981 + }, + { + "epoch": 0.095712, + "grad_norm": 1.1796875, + "learning_rate": 9.115967741935484e-05, + "loss": 0.214, + "step": 5982 + }, + { + "epoch": 0.095728, + "grad_norm": 0.93359375, + "learning_rate": 9.115806451612904e-05, + "loss": 0.2083, + "step": 5983 + }, + { + "epoch": 0.095744, + "grad_norm": 0.890625, + "learning_rate": 9.115645161290322e-05, + "loss": 0.16, + "step": 5984 + }, + { + "epoch": 0.09576, + "grad_norm": 0.77734375, + "learning_rate": 9.115483870967742e-05, + "loss": 0.1742, + "step": 5985 + }, + { + "epoch": 0.095776, + "grad_norm": 1.140625, + "learning_rate": 9.115322580645162e-05, + "loss": 0.1614, + "step": 5986 + }, + { + "epoch": 0.095792, + "grad_norm": 1.0546875, + "learning_rate": 9.115161290322582e-05, + "loss": 0.17, + "step": 5987 + }, + { + "epoch": 0.095808, + "grad_norm": 0.68359375, + "learning_rate": 9.115e-05, + "loss": 0.1548, + "step": 5988 + }, + { + "epoch": 0.095824, + "grad_norm": 0.97265625, + "learning_rate": 9.11483870967742e-05, + "loss": 0.1787, + "step": 5989 + }, + { + "epoch": 0.09584, + "grad_norm": 0.671875, + "learning_rate": 9.114677419354839e-05, + "loss": 0.1604, + "step": 5990 + }, + { + "epoch": 0.095856, + "grad_norm": 0.6640625, + "learning_rate": 9.114516129032259e-05, + "loss": 0.1393, + "step": 5991 + }, + { + "epoch": 0.095872, + "grad_norm": 0.5, + "learning_rate": 9.114354838709678e-05, + "loss": 0.1708, + "step": 5992 + }, + { + "epoch": 0.095888, + "grad_norm": 0.76953125, + "learning_rate": 9.114193548387098e-05, + "loss": 0.1768, + "step": 5993 + }, + { + "epoch": 0.095904, + "grad_norm": 0.9296875, + "learning_rate": 9.114032258064516e-05, + "loss": 0.1449, + "step": 5994 + }, + { + "epoch": 0.09592, + "grad_norm": 0.765625, + "learning_rate": 9.113870967741935e-05, + "loss": 0.182, + "step": 5995 + }, + { + "epoch": 0.095936, + "grad_norm": 0.95703125, + "learning_rate": 9.113709677419355e-05, + "loss": 0.1778, + "step": 5996 + }, + { + "epoch": 0.095952, + "grad_norm": 0.89453125, + "learning_rate": 9.113548387096774e-05, + "loss": 0.1851, + "step": 5997 + }, + { + "epoch": 0.095968, + "grad_norm": 1.4296875, + "learning_rate": 9.113387096774194e-05, + "loss": 0.1592, + "step": 5998 + }, + { + "epoch": 0.095984, + "grad_norm": 1.0078125, + "learning_rate": 9.113225806451613e-05, + "loss": 0.1888, + "step": 5999 + }, + { + "epoch": 0.096, + "grad_norm": 0.8984375, + "learning_rate": 9.113064516129033e-05, + "loss": 0.1546, + "step": 6000 + }, + { + "epoch": 0.096016, + "grad_norm": 0.65234375, + "learning_rate": 9.112903225806452e-05, + "loss": 0.1908, + "step": 6001 + }, + { + "epoch": 0.096032, + "grad_norm": 0.80078125, + "learning_rate": 9.112741935483872e-05, + "loss": 0.2097, + "step": 6002 + }, + { + "epoch": 0.096048, + "grad_norm": 0.9140625, + "learning_rate": 9.11258064516129e-05, + "loss": 0.176, + "step": 6003 + }, + { + "epoch": 0.096064, + "grad_norm": 0.62890625, + "learning_rate": 9.11241935483871e-05, + "loss": 0.1585, + "step": 6004 + }, + { + "epoch": 0.09608, + "grad_norm": 0.78515625, + "learning_rate": 9.112258064516129e-05, + "loss": 0.138, + "step": 6005 + }, + { + "epoch": 0.096096, + "grad_norm": 1.03125, + "learning_rate": 9.112096774193549e-05, + "loss": 0.2027, + "step": 6006 + }, + { + "epoch": 0.096112, + "grad_norm": 1.1953125, + "learning_rate": 9.111935483870968e-05, + "loss": 0.1561, + "step": 6007 + }, + { + "epoch": 0.096128, + "grad_norm": 0.77734375, + "learning_rate": 9.111774193548388e-05, + "loss": 0.1582, + "step": 6008 + }, + { + "epoch": 0.096144, + "grad_norm": 0.82421875, + "learning_rate": 9.111612903225806e-05, + "loss": 0.1837, + "step": 6009 + }, + { + "epoch": 0.09616, + "grad_norm": 0.890625, + "learning_rate": 9.111451612903226e-05, + "loss": 0.1422, + "step": 6010 + }, + { + "epoch": 0.096176, + "grad_norm": 1.1484375, + "learning_rate": 9.111290322580646e-05, + "loss": 0.1781, + "step": 6011 + }, + { + "epoch": 0.096192, + "grad_norm": 0.7578125, + "learning_rate": 9.111129032258065e-05, + "loss": 0.1318, + "step": 6012 + }, + { + "epoch": 0.096208, + "grad_norm": 0.96484375, + "learning_rate": 9.110967741935485e-05, + "loss": 0.1785, + "step": 6013 + }, + { + "epoch": 0.096224, + "grad_norm": 0.96484375, + "learning_rate": 9.110806451612903e-05, + "loss": 0.1822, + "step": 6014 + }, + { + "epoch": 0.09624, + "grad_norm": 0.83984375, + "learning_rate": 9.110645161290323e-05, + "loss": 0.1661, + "step": 6015 + }, + { + "epoch": 0.096256, + "grad_norm": 0.64453125, + "learning_rate": 9.110483870967742e-05, + "loss": 0.1586, + "step": 6016 + }, + { + "epoch": 0.096272, + "grad_norm": 1.53125, + "learning_rate": 9.110322580645162e-05, + "loss": 0.2054, + "step": 6017 + }, + { + "epoch": 0.096288, + "grad_norm": 0.7578125, + "learning_rate": 9.11016129032258e-05, + "loss": 0.1677, + "step": 6018 + }, + { + "epoch": 0.096304, + "grad_norm": 0.61328125, + "learning_rate": 9.11e-05, + "loss": 0.207, + "step": 6019 + }, + { + "epoch": 0.09632, + "grad_norm": 0.71484375, + "learning_rate": 9.109838709677419e-05, + "loss": 0.1856, + "step": 6020 + }, + { + "epoch": 0.096336, + "grad_norm": 1.40625, + "learning_rate": 9.109677419354839e-05, + "loss": 0.1964, + "step": 6021 + }, + { + "epoch": 0.096352, + "grad_norm": 0.6953125, + "learning_rate": 9.109516129032258e-05, + "loss": 0.1806, + "step": 6022 + }, + { + "epoch": 0.096368, + "grad_norm": 0.64453125, + "learning_rate": 9.109354838709678e-05, + "loss": 0.1248, + "step": 6023 + }, + { + "epoch": 0.096384, + "grad_norm": 0.78515625, + "learning_rate": 9.109193548387098e-05, + "loss": 0.1624, + "step": 6024 + }, + { + "epoch": 0.0964, + "grad_norm": 1.53125, + "learning_rate": 9.109032258064518e-05, + "loss": 0.2008, + "step": 6025 + }, + { + "epoch": 0.096416, + "grad_norm": 0.85546875, + "learning_rate": 9.108870967741936e-05, + "loss": 0.1594, + "step": 6026 + }, + { + "epoch": 0.096432, + "grad_norm": 1.53125, + "learning_rate": 9.108709677419355e-05, + "loss": 0.2201, + "step": 6027 + }, + { + "epoch": 0.096448, + "grad_norm": 0.90625, + "learning_rate": 9.108548387096775e-05, + "loss": 0.1806, + "step": 6028 + }, + { + "epoch": 0.096464, + "grad_norm": 0.96484375, + "learning_rate": 9.108387096774193e-05, + "loss": 0.2048, + "step": 6029 + }, + { + "epoch": 0.09648, + "grad_norm": 0.61328125, + "learning_rate": 9.108225806451613e-05, + "loss": 0.1476, + "step": 6030 + }, + { + "epoch": 0.096496, + "grad_norm": 1.0234375, + "learning_rate": 9.108064516129032e-05, + "loss": 0.1416, + "step": 6031 + }, + { + "epoch": 0.096512, + "grad_norm": 0.66015625, + "learning_rate": 9.107903225806452e-05, + "loss": 0.1563, + "step": 6032 + }, + { + "epoch": 0.096528, + "grad_norm": 0.81640625, + "learning_rate": 9.10774193548387e-05, + "loss": 0.2091, + "step": 6033 + }, + { + "epoch": 0.096544, + "grad_norm": 0.67578125, + "learning_rate": 9.10758064516129e-05, + "loss": 0.1774, + "step": 6034 + }, + { + "epoch": 0.09656, + "grad_norm": 0.8671875, + "learning_rate": 9.10741935483871e-05, + "loss": 0.1886, + "step": 6035 + }, + { + "epoch": 0.096576, + "grad_norm": 1.09375, + "learning_rate": 9.10725806451613e-05, + "loss": 0.1514, + "step": 6036 + }, + { + "epoch": 0.096592, + "grad_norm": 1.015625, + "learning_rate": 9.107096774193549e-05, + "loss": 0.2002, + "step": 6037 + }, + { + "epoch": 0.096608, + "grad_norm": 1.15625, + "learning_rate": 9.106935483870969e-05, + "loss": 0.1953, + "step": 6038 + }, + { + "epoch": 0.096624, + "grad_norm": 0.8046875, + "learning_rate": 9.106774193548388e-05, + "loss": 0.17, + "step": 6039 + }, + { + "epoch": 0.09664, + "grad_norm": 0.81640625, + "learning_rate": 9.106612903225808e-05, + "loss": 0.1861, + "step": 6040 + }, + { + "epoch": 0.096656, + "grad_norm": 1.0234375, + "learning_rate": 9.106451612903226e-05, + "loss": 0.1628, + "step": 6041 + }, + { + "epoch": 0.096672, + "grad_norm": 0.921875, + "learning_rate": 9.106290322580645e-05, + "loss": 0.1833, + "step": 6042 + }, + { + "epoch": 0.096688, + "grad_norm": 1.5, + "learning_rate": 9.106129032258065e-05, + "loss": 0.1459, + "step": 6043 + }, + { + "epoch": 0.096704, + "grad_norm": 0.62890625, + "learning_rate": 9.105967741935483e-05, + "loss": 0.1665, + "step": 6044 + }, + { + "epoch": 0.09672, + "grad_norm": 0.671875, + "learning_rate": 9.105806451612903e-05, + "loss": 0.1751, + "step": 6045 + }, + { + "epoch": 0.096736, + "grad_norm": 1.265625, + "learning_rate": 9.105645161290323e-05, + "loss": 0.1961, + "step": 6046 + }, + { + "epoch": 0.096752, + "grad_norm": 0.56640625, + "learning_rate": 9.105483870967743e-05, + "loss": 0.1687, + "step": 6047 + }, + { + "epoch": 0.096768, + "grad_norm": 0.93359375, + "learning_rate": 9.105322580645162e-05, + "loss": 0.1824, + "step": 6048 + }, + { + "epoch": 0.096784, + "grad_norm": 1.0390625, + "learning_rate": 9.105161290322582e-05, + "loss": 0.1641, + "step": 6049 + }, + { + "epoch": 0.0968, + "grad_norm": 0.74609375, + "learning_rate": 9.105e-05, + "loss": 0.13, + "step": 6050 + }, + { + "epoch": 0.096816, + "grad_norm": 0.93359375, + "learning_rate": 9.10483870967742e-05, + "loss": 0.1575, + "step": 6051 + }, + { + "epoch": 0.096832, + "grad_norm": 0.73046875, + "learning_rate": 9.104677419354839e-05, + "loss": 0.151, + "step": 6052 + }, + { + "epoch": 0.096848, + "grad_norm": 1.0390625, + "learning_rate": 9.104516129032259e-05, + "loss": 0.172, + "step": 6053 + }, + { + "epoch": 0.096864, + "grad_norm": 0.57421875, + "learning_rate": 9.104354838709678e-05, + "loss": 0.1709, + "step": 6054 + }, + { + "epoch": 0.09688, + "grad_norm": 0.671875, + "learning_rate": 9.104193548387098e-05, + "loss": 0.1448, + "step": 6055 + }, + { + "epoch": 0.096896, + "grad_norm": 0.8046875, + "learning_rate": 9.104032258064516e-05, + "loss": 0.1905, + "step": 6056 + }, + { + "epoch": 0.096912, + "grad_norm": 0.6171875, + "learning_rate": 9.103870967741935e-05, + "loss": 0.1814, + "step": 6057 + }, + { + "epoch": 0.096928, + "grad_norm": 1.40625, + "learning_rate": 9.103709677419355e-05, + "loss": 0.1994, + "step": 6058 + }, + { + "epoch": 0.096944, + "grad_norm": 1.1640625, + "learning_rate": 9.103548387096775e-05, + "loss": 0.2193, + "step": 6059 + }, + { + "epoch": 0.09696, + "grad_norm": 1.109375, + "learning_rate": 9.103387096774195e-05, + "loss": 0.1589, + "step": 6060 + }, + { + "epoch": 0.096976, + "grad_norm": 0.69921875, + "learning_rate": 9.103225806451613e-05, + "loss": 0.2071, + "step": 6061 + }, + { + "epoch": 0.096992, + "grad_norm": 1.203125, + "learning_rate": 9.103064516129033e-05, + "loss": 0.1666, + "step": 6062 + }, + { + "epoch": 0.097008, + "grad_norm": 1.34375, + "learning_rate": 9.102903225806452e-05, + "loss": 0.1884, + "step": 6063 + }, + { + "epoch": 0.097024, + "grad_norm": 0.86328125, + "learning_rate": 9.102741935483872e-05, + "loss": 0.1477, + "step": 6064 + }, + { + "epoch": 0.09704, + "grad_norm": 0.72265625, + "learning_rate": 9.10258064516129e-05, + "loss": 0.1542, + "step": 6065 + }, + { + "epoch": 0.097056, + "grad_norm": 1.046875, + "learning_rate": 9.10241935483871e-05, + "loss": 0.1838, + "step": 6066 + }, + { + "epoch": 0.097072, + "grad_norm": 0.6484375, + "learning_rate": 9.102258064516129e-05, + "loss": 0.1652, + "step": 6067 + }, + { + "epoch": 0.097088, + "grad_norm": 0.59765625, + "learning_rate": 9.102096774193549e-05, + "loss": 0.151, + "step": 6068 + }, + { + "epoch": 0.097104, + "grad_norm": 0.59375, + "learning_rate": 9.101935483870968e-05, + "loss": 0.1528, + "step": 6069 + }, + { + "epoch": 0.09712, + "grad_norm": 0.73828125, + "learning_rate": 9.101774193548387e-05, + "loss": 0.1663, + "step": 6070 + }, + { + "epoch": 0.097136, + "grad_norm": 0.79296875, + "learning_rate": 9.101612903225807e-05, + "loss": 0.1854, + "step": 6071 + }, + { + "epoch": 0.097152, + "grad_norm": 0.91015625, + "learning_rate": 9.101451612903227e-05, + "loss": 0.1905, + "step": 6072 + }, + { + "epoch": 0.097168, + "grad_norm": 0.9140625, + "learning_rate": 9.101290322580646e-05, + "loss": 0.1837, + "step": 6073 + }, + { + "epoch": 0.097184, + "grad_norm": 0.8984375, + "learning_rate": 9.101129032258065e-05, + "loss": 0.2155, + "step": 6074 + }, + { + "epoch": 0.0972, + "grad_norm": 0.78515625, + "learning_rate": 9.100967741935485e-05, + "loss": 0.1645, + "step": 6075 + }, + { + "epoch": 0.097216, + "grad_norm": 1.328125, + "learning_rate": 9.100806451612903e-05, + "loss": 0.1665, + "step": 6076 + }, + { + "epoch": 0.097232, + "grad_norm": 0.6484375, + "learning_rate": 9.100645161290323e-05, + "loss": 0.1368, + "step": 6077 + }, + { + "epoch": 0.097248, + "grad_norm": 1.2265625, + "learning_rate": 9.100483870967742e-05, + "loss": 0.1803, + "step": 6078 + }, + { + "epoch": 0.097264, + "grad_norm": 0.7109375, + "learning_rate": 9.100322580645162e-05, + "loss": 0.1768, + "step": 6079 + }, + { + "epoch": 0.09728, + "grad_norm": 1.171875, + "learning_rate": 9.10016129032258e-05, + "loss": 0.2028, + "step": 6080 + }, + { + "epoch": 0.097296, + "grad_norm": 0.61328125, + "learning_rate": 9.1e-05, + "loss": 0.1718, + "step": 6081 + }, + { + "epoch": 0.097312, + "grad_norm": 1.0859375, + "learning_rate": 9.09983870967742e-05, + "loss": 0.1899, + "step": 6082 + }, + { + "epoch": 0.097328, + "grad_norm": 1.359375, + "learning_rate": 9.099677419354839e-05, + "loss": 0.1907, + "step": 6083 + }, + { + "epoch": 0.097344, + "grad_norm": 0.96484375, + "learning_rate": 9.099516129032259e-05, + "loss": 0.1854, + "step": 6084 + }, + { + "epoch": 0.09736, + "grad_norm": 0.89453125, + "learning_rate": 9.099354838709679e-05, + "loss": 0.2252, + "step": 6085 + }, + { + "epoch": 0.097376, + "grad_norm": 0.609375, + "learning_rate": 9.099193548387097e-05, + "loss": 0.1813, + "step": 6086 + }, + { + "epoch": 0.097392, + "grad_norm": 0.71484375, + "learning_rate": 9.099032258064517e-05, + "loss": 0.1982, + "step": 6087 + }, + { + "epoch": 0.097408, + "grad_norm": 1.3515625, + "learning_rate": 9.098870967741936e-05, + "loss": 0.1706, + "step": 6088 + }, + { + "epoch": 0.097424, + "grad_norm": 1.6484375, + "learning_rate": 9.098709677419355e-05, + "loss": 0.2105, + "step": 6089 + }, + { + "epoch": 0.09744, + "grad_norm": 0.62109375, + "learning_rate": 9.098548387096775e-05, + "loss": 0.1701, + "step": 6090 + }, + { + "epoch": 0.097456, + "grad_norm": 0.94921875, + "learning_rate": 9.098387096774193e-05, + "loss": 0.1724, + "step": 6091 + }, + { + "epoch": 0.097472, + "grad_norm": 0.58203125, + "learning_rate": 9.098225806451613e-05, + "loss": 0.1735, + "step": 6092 + }, + { + "epoch": 0.097488, + "grad_norm": 0.89453125, + "learning_rate": 9.098064516129032e-05, + "loss": 0.1915, + "step": 6093 + }, + { + "epoch": 0.097504, + "grad_norm": 0.73828125, + "learning_rate": 9.097903225806452e-05, + "loss": 0.1756, + "step": 6094 + }, + { + "epoch": 0.09752, + "grad_norm": 1.1328125, + "learning_rate": 9.097741935483872e-05, + "loss": 0.1527, + "step": 6095 + }, + { + "epoch": 0.097536, + "grad_norm": 0.90234375, + "learning_rate": 9.097580645161292e-05, + "loss": 0.1544, + "step": 6096 + }, + { + "epoch": 0.097552, + "grad_norm": 0.68359375, + "learning_rate": 9.09741935483871e-05, + "loss": 0.1576, + "step": 6097 + }, + { + "epoch": 0.097568, + "grad_norm": 0.76953125, + "learning_rate": 9.09725806451613e-05, + "loss": 0.1691, + "step": 6098 + }, + { + "epoch": 0.097584, + "grad_norm": 0.99609375, + "learning_rate": 9.097096774193549e-05, + "loss": 0.1673, + "step": 6099 + }, + { + "epoch": 0.0976, + "grad_norm": 1.1640625, + "learning_rate": 9.096935483870969e-05, + "loss": 0.1769, + "step": 6100 + }, + { + "epoch": 0.097616, + "grad_norm": 0.734375, + "learning_rate": 9.096774193548387e-05, + "loss": 0.1708, + "step": 6101 + }, + { + "epoch": 0.097632, + "grad_norm": 0.796875, + "learning_rate": 9.096612903225807e-05, + "loss": 0.181, + "step": 6102 + }, + { + "epoch": 0.097648, + "grad_norm": 0.6796875, + "learning_rate": 9.096451612903226e-05, + "loss": 0.1872, + "step": 6103 + }, + { + "epoch": 0.097664, + "grad_norm": 0.859375, + "learning_rate": 9.096290322580645e-05, + "loss": 0.1883, + "step": 6104 + }, + { + "epoch": 0.09768, + "grad_norm": 1.0390625, + "learning_rate": 9.096129032258064e-05, + "loss": 0.1936, + "step": 6105 + }, + { + "epoch": 0.097696, + "grad_norm": 0.96875, + "learning_rate": 9.095967741935484e-05, + "loss": 0.1944, + "step": 6106 + }, + { + "epoch": 0.097712, + "grad_norm": 0.87109375, + "learning_rate": 9.095806451612904e-05, + "loss": 0.1852, + "step": 6107 + }, + { + "epoch": 0.097728, + "grad_norm": 0.86328125, + "learning_rate": 9.095645161290323e-05, + "loss": 0.1909, + "step": 6108 + }, + { + "epoch": 0.097744, + "grad_norm": 0.8515625, + "learning_rate": 9.095483870967743e-05, + "loss": 0.2392, + "step": 6109 + }, + { + "epoch": 0.09776, + "grad_norm": 0.7734375, + "learning_rate": 9.095322580645162e-05, + "loss": 0.1701, + "step": 6110 + }, + { + "epoch": 0.097776, + "grad_norm": 1.203125, + "learning_rate": 9.095161290322582e-05, + "loss": 0.1633, + "step": 6111 + }, + { + "epoch": 0.097792, + "grad_norm": 0.65625, + "learning_rate": 9.095e-05, + "loss": 0.187, + "step": 6112 + }, + { + "epoch": 0.097808, + "grad_norm": 0.9375, + "learning_rate": 9.09483870967742e-05, + "loss": 0.1613, + "step": 6113 + }, + { + "epoch": 0.097824, + "grad_norm": 0.72265625, + "learning_rate": 9.094677419354839e-05, + "loss": 0.1684, + "step": 6114 + }, + { + "epoch": 0.09784, + "grad_norm": 0.54296875, + "learning_rate": 9.094516129032259e-05, + "loss": 0.1681, + "step": 6115 + }, + { + "epoch": 0.097856, + "grad_norm": 1.09375, + "learning_rate": 9.094354838709677e-05, + "loss": 0.1939, + "step": 6116 + }, + { + "epoch": 0.097872, + "grad_norm": 0.6640625, + "learning_rate": 9.094193548387097e-05, + "loss": 0.1848, + "step": 6117 + }, + { + "epoch": 0.097888, + "grad_norm": 1.0234375, + "learning_rate": 9.094032258064516e-05, + "loss": 0.2167, + "step": 6118 + }, + { + "epoch": 0.097904, + "grad_norm": 1.015625, + "learning_rate": 9.093870967741936e-05, + "loss": 0.1619, + "step": 6119 + }, + { + "epoch": 0.09792, + "grad_norm": 0.91015625, + "learning_rate": 9.093709677419356e-05, + "loss": 0.2358, + "step": 6120 + }, + { + "epoch": 0.097936, + "grad_norm": 0.56640625, + "learning_rate": 9.093548387096774e-05, + "loss": 0.1653, + "step": 6121 + }, + { + "epoch": 0.097952, + "grad_norm": 0.68359375, + "learning_rate": 9.093387096774194e-05, + "loss": 0.1716, + "step": 6122 + }, + { + "epoch": 0.097968, + "grad_norm": 0.6015625, + "learning_rate": 9.093225806451613e-05, + "loss": 0.1551, + "step": 6123 + }, + { + "epoch": 0.097984, + "grad_norm": 0.9453125, + "learning_rate": 9.093064516129033e-05, + "loss": 0.1872, + "step": 6124 + }, + { + "epoch": 0.098, + "grad_norm": 1.0, + "learning_rate": 9.092903225806452e-05, + "loss": 0.2084, + "step": 6125 + }, + { + "epoch": 0.098016, + "grad_norm": 0.66796875, + "learning_rate": 9.092741935483872e-05, + "loss": 0.2106, + "step": 6126 + }, + { + "epoch": 0.098032, + "grad_norm": 0.8203125, + "learning_rate": 9.09258064516129e-05, + "loss": 0.1936, + "step": 6127 + }, + { + "epoch": 0.098048, + "grad_norm": 0.56640625, + "learning_rate": 9.09241935483871e-05, + "loss": 0.1528, + "step": 6128 + }, + { + "epoch": 0.098064, + "grad_norm": 0.62109375, + "learning_rate": 9.092258064516129e-05, + "loss": 0.1583, + "step": 6129 + }, + { + "epoch": 0.09808, + "grad_norm": 1.125, + "learning_rate": 9.092096774193549e-05, + "loss": 0.2273, + "step": 6130 + }, + { + "epoch": 0.098096, + "grad_norm": 1.0859375, + "learning_rate": 9.091935483870969e-05, + "loss": 0.1959, + "step": 6131 + }, + { + "epoch": 0.098112, + "grad_norm": 1.109375, + "learning_rate": 9.091774193548389e-05, + "loss": 0.2293, + "step": 6132 + }, + { + "epoch": 0.098128, + "grad_norm": 0.59375, + "learning_rate": 9.091612903225807e-05, + "loss": 0.1472, + "step": 6133 + }, + { + "epoch": 0.098144, + "grad_norm": 0.63671875, + "learning_rate": 9.091451612903227e-05, + "loss": 0.1838, + "step": 6134 + }, + { + "epoch": 0.09816, + "grad_norm": 1.0, + "learning_rate": 9.091290322580646e-05, + "loss": 0.2142, + "step": 6135 + }, + { + "epoch": 0.098176, + "grad_norm": 0.9921875, + "learning_rate": 9.091129032258064e-05, + "loss": 0.1524, + "step": 6136 + }, + { + "epoch": 0.098192, + "grad_norm": 0.90234375, + "learning_rate": 9.090967741935484e-05, + "loss": 0.2179, + "step": 6137 + }, + { + "epoch": 0.098208, + "grad_norm": 0.8515625, + "learning_rate": 9.090806451612903e-05, + "loss": 0.1867, + "step": 6138 + }, + { + "epoch": 0.098224, + "grad_norm": 1.0, + "learning_rate": 9.090645161290323e-05, + "loss": 0.211, + "step": 6139 + }, + { + "epoch": 0.09824, + "grad_norm": 0.73828125, + "learning_rate": 9.090483870967742e-05, + "loss": 0.1682, + "step": 6140 + }, + { + "epoch": 0.098256, + "grad_norm": 0.796875, + "learning_rate": 9.090322580645161e-05, + "loss": 0.1805, + "step": 6141 + }, + { + "epoch": 0.098272, + "grad_norm": 0.62890625, + "learning_rate": 9.090161290322581e-05, + "loss": 0.183, + "step": 6142 + }, + { + "epoch": 0.098288, + "grad_norm": 0.875, + "learning_rate": 9.090000000000001e-05, + "loss": 0.1627, + "step": 6143 + }, + { + "epoch": 0.098304, + "grad_norm": 0.70703125, + "learning_rate": 9.08983870967742e-05, + "loss": 0.2007, + "step": 6144 + }, + { + "epoch": 0.09832, + "grad_norm": 0.70703125, + "learning_rate": 9.08967741935484e-05, + "loss": 0.1535, + "step": 6145 + }, + { + "epoch": 0.098336, + "grad_norm": 0.85546875, + "learning_rate": 9.089516129032259e-05, + "loss": 0.2155, + "step": 6146 + }, + { + "epoch": 0.098352, + "grad_norm": 1.5625, + "learning_rate": 9.089354838709679e-05, + "loss": 0.1835, + "step": 6147 + }, + { + "epoch": 0.098368, + "grad_norm": 0.8046875, + "learning_rate": 9.089193548387097e-05, + "loss": 0.1514, + "step": 6148 + }, + { + "epoch": 0.098384, + "grad_norm": 1.1015625, + "learning_rate": 9.089032258064517e-05, + "loss": 0.1958, + "step": 6149 + }, + { + "epoch": 0.0984, + "grad_norm": 0.69140625, + "learning_rate": 9.088870967741936e-05, + "loss": 0.1609, + "step": 6150 + }, + { + "epoch": 0.098416, + "grad_norm": 1.2734375, + "learning_rate": 9.088709677419354e-05, + "loss": 0.19, + "step": 6151 + }, + { + "epoch": 0.098432, + "grad_norm": 0.83984375, + "learning_rate": 9.088548387096774e-05, + "loss": 0.188, + "step": 6152 + }, + { + "epoch": 0.098448, + "grad_norm": 0.77734375, + "learning_rate": 9.088387096774193e-05, + "loss": 0.1734, + "step": 6153 + }, + { + "epoch": 0.098464, + "grad_norm": 0.734375, + "learning_rate": 9.088225806451613e-05, + "loss": 0.1695, + "step": 6154 + }, + { + "epoch": 0.09848, + "grad_norm": 0.6015625, + "learning_rate": 9.088064516129033e-05, + "loss": 0.1464, + "step": 6155 + }, + { + "epoch": 0.098496, + "grad_norm": 0.84765625, + "learning_rate": 9.087903225806453e-05, + "loss": 0.2051, + "step": 6156 + }, + { + "epoch": 0.098512, + "grad_norm": 0.95703125, + "learning_rate": 9.087741935483871e-05, + "loss": 0.2007, + "step": 6157 + }, + { + "epoch": 0.098528, + "grad_norm": 0.98828125, + "learning_rate": 9.087580645161291e-05, + "loss": 0.1726, + "step": 6158 + }, + { + "epoch": 0.098544, + "grad_norm": 0.83984375, + "learning_rate": 9.08741935483871e-05, + "loss": 0.2113, + "step": 6159 + }, + { + "epoch": 0.09856, + "grad_norm": 0.79296875, + "learning_rate": 9.08725806451613e-05, + "loss": 0.2117, + "step": 6160 + }, + { + "epoch": 0.098576, + "grad_norm": 0.9453125, + "learning_rate": 9.087096774193549e-05, + "loss": 0.1934, + "step": 6161 + }, + { + "epoch": 0.098592, + "grad_norm": 0.85546875, + "learning_rate": 9.086935483870968e-05, + "loss": 0.1648, + "step": 6162 + }, + { + "epoch": 0.098608, + "grad_norm": 0.56640625, + "learning_rate": 9.086774193548387e-05, + "loss": 0.1384, + "step": 6163 + }, + { + "epoch": 0.098624, + "grad_norm": 0.765625, + "learning_rate": 9.086612903225807e-05, + "loss": 0.174, + "step": 6164 + }, + { + "epoch": 0.09864, + "grad_norm": 0.83203125, + "learning_rate": 9.086451612903226e-05, + "loss": 0.1661, + "step": 6165 + }, + { + "epoch": 0.098656, + "grad_norm": 0.94140625, + "learning_rate": 9.086290322580646e-05, + "loss": 0.1556, + "step": 6166 + }, + { + "epoch": 0.098672, + "grad_norm": 0.875, + "learning_rate": 9.086129032258066e-05, + "loss": 0.1618, + "step": 6167 + }, + { + "epoch": 0.098688, + "grad_norm": 0.78515625, + "learning_rate": 9.085967741935484e-05, + "loss": 0.1963, + "step": 6168 + }, + { + "epoch": 0.098704, + "grad_norm": 0.7421875, + "learning_rate": 9.085806451612904e-05, + "loss": 0.1923, + "step": 6169 + }, + { + "epoch": 0.09872, + "grad_norm": 0.87109375, + "learning_rate": 9.085645161290323e-05, + "loss": 0.1844, + "step": 6170 + }, + { + "epoch": 0.098736, + "grad_norm": 0.84375, + "learning_rate": 9.085483870967743e-05, + "loss": 0.1889, + "step": 6171 + }, + { + "epoch": 0.098752, + "grad_norm": 0.80859375, + "learning_rate": 9.085322580645161e-05, + "loss": 0.1612, + "step": 6172 + }, + { + "epoch": 0.098768, + "grad_norm": 1.078125, + "learning_rate": 9.085161290322581e-05, + "loss": 0.1759, + "step": 6173 + }, + { + "epoch": 0.098784, + "grad_norm": 0.65625, + "learning_rate": 9.085e-05, + "loss": 0.1689, + "step": 6174 + }, + { + "epoch": 0.0988, + "grad_norm": 0.79296875, + "learning_rate": 9.08483870967742e-05, + "loss": 0.1803, + "step": 6175 + }, + { + "epoch": 0.098816, + "grad_norm": 0.8125, + "learning_rate": 9.084677419354838e-05, + "loss": 0.171, + "step": 6176 + }, + { + "epoch": 0.098832, + "grad_norm": 0.80859375, + "learning_rate": 9.084516129032258e-05, + "loss": 0.2228, + "step": 6177 + }, + { + "epoch": 0.098848, + "grad_norm": 1.6015625, + "learning_rate": 9.084354838709677e-05, + "loss": 0.2094, + "step": 6178 + }, + { + "epoch": 0.098864, + "grad_norm": 1.2109375, + "learning_rate": 9.084193548387097e-05, + "loss": 0.1753, + "step": 6179 + }, + { + "epoch": 0.09888, + "grad_norm": 0.6328125, + "learning_rate": 9.084032258064517e-05, + "loss": 0.1564, + "step": 6180 + }, + { + "epoch": 0.098896, + "grad_norm": 0.83203125, + "learning_rate": 9.083870967741937e-05, + "loss": 0.2001, + "step": 6181 + }, + { + "epoch": 0.098912, + "grad_norm": 0.6796875, + "learning_rate": 9.083709677419356e-05, + "loss": 0.1699, + "step": 6182 + }, + { + "epoch": 0.098928, + "grad_norm": 0.9921875, + "learning_rate": 9.083548387096774e-05, + "loss": 0.1869, + "step": 6183 + }, + { + "epoch": 0.098944, + "grad_norm": 0.67578125, + "learning_rate": 9.083387096774194e-05, + "loss": 0.1683, + "step": 6184 + }, + { + "epoch": 0.09896, + "grad_norm": 0.75, + "learning_rate": 9.083225806451613e-05, + "loss": 0.1948, + "step": 6185 + }, + { + "epoch": 0.098976, + "grad_norm": 0.69140625, + "learning_rate": 9.083064516129033e-05, + "loss": 0.16, + "step": 6186 + }, + { + "epoch": 0.098992, + "grad_norm": 0.828125, + "learning_rate": 9.082903225806451e-05, + "loss": 0.2032, + "step": 6187 + }, + { + "epoch": 0.099008, + "grad_norm": 0.53515625, + "learning_rate": 9.082741935483871e-05, + "loss": 0.1297, + "step": 6188 + }, + { + "epoch": 0.099024, + "grad_norm": 1.2421875, + "learning_rate": 9.08258064516129e-05, + "loss": 0.1733, + "step": 6189 + }, + { + "epoch": 0.09904, + "grad_norm": 1.515625, + "learning_rate": 9.08241935483871e-05, + "loss": 0.2269, + "step": 6190 + }, + { + "epoch": 0.099056, + "grad_norm": 1.4921875, + "learning_rate": 9.08225806451613e-05, + "loss": 0.1703, + "step": 6191 + }, + { + "epoch": 0.099072, + "grad_norm": 1.1640625, + "learning_rate": 9.08209677419355e-05, + "loss": 0.1964, + "step": 6192 + }, + { + "epoch": 0.099088, + "grad_norm": 1.0078125, + "learning_rate": 9.081935483870968e-05, + "loss": 0.2063, + "step": 6193 + }, + { + "epoch": 0.099104, + "grad_norm": 0.6171875, + "learning_rate": 9.081774193548388e-05, + "loss": 0.1812, + "step": 6194 + }, + { + "epoch": 0.09912, + "grad_norm": 0.62109375, + "learning_rate": 9.081612903225807e-05, + "loss": 0.1559, + "step": 6195 + }, + { + "epoch": 0.099136, + "grad_norm": 0.82421875, + "learning_rate": 9.081451612903227e-05, + "loss": 0.1654, + "step": 6196 + }, + { + "epoch": 0.099152, + "grad_norm": 0.953125, + "learning_rate": 9.081290322580646e-05, + "loss": 0.1909, + "step": 6197 + }, + { + "epoch": 0.099168, + "grad_norm": 1.3515625, + "learning_rate": 9.081129032258064e-05, + "loss": 0.2072, + "step": 6198 + }, + { + "epoch": 0.099184, + "grad_norm": 1.6953125, + "learning_rate": 9.080967741935484e-05, + "loss": 0.2134, + "step": 6199 + }, + { + "epoch": 0.0992, + "grad_norm": 1.0390625, + "learning_rate": 9.080806451612903e-05, + "loss": 0.1732, + "step": 6200 + }, + { + "epoch": 0.099216, + "grad_norm": 0.890625, + "learning_rate": 9.080645161290323e-05, + "loss": 0.2026, + "step": 6201 + }, + { + "epoch": 0.099232, + "grad_norm": 0.765625, + "learning_rate": 9.080483870967743e-05, + "loss": 0.1964, + "step": 6202 + }, + { + "epoch": 0.099248, + "grad_norm": 1.4609375, + "learning_rate": 9.080322580645163e-05, + "loss": 0.1556, + "step": 6203 + }, + { + "epoch": 0.099264, + "grad_norm": 0.84375, + "learning_rate": 9.080161290322581e-05, + "loss": 0.2284, + "step": 6204 + }, + { + "epoch": 0.09928, + "grad_norm": 1.109375, + "learning_rate": 9.080000000000001e-05, + "loss": 0.1626, + "step": 6205 + }, + { + "epoch": 0.099296, + "grad_norm": 0.71484375, + "learning_rate": 9.07983870967742e-05, + "loss": 0.1455, + "step": 6206 + }, + { + "epoch": 0.099312, + "grad_norm": 0.76953125, + "learning_rate": 9.07967741935484e-05, + "loss": 0.1974, + "step": 6207 + }, + { + "epoch": 0.099328, + "grad_norm": 1.6875, + "learning_rate": 9.079516129032258e-05, + "loss": 0.23, + "step": 6208 + }, + { + "epoch": 0.099344, + "grad_norm": 0.984375, + "learning_rate": 9.079354838709678e-05, + "loss": 0.1695, + "step": 6209 + }, + { + "epoch": 0.09936, + "grad_norm": 0.5546875, + "learning_rate": 9.079193548387097e-05, + "loss": 0.1409, + "step": 6210 + }, + { + "epoch": 0.099376, + "grad_norm": 1.265625, + "learning_rate": 9.079032258064517e-05, + "loss": 0.1986, + "step": 6211 + }, + { + "epoch": 0.099392, + "grad_norm": 0.734375, + "learning_rate": 9.078870967741935e-05, + "loss": 0.2219, + "step": 6212 + }, + { + "epoch": 0.099408, + "grad_norm": 0.54296875, + "learning_rate": 9.078709677419354e-05, + "loss": 0.1603, + "step": 6213 + }, + { + "epoch": 0.099424, + "grad_norm": 0.671875, + "learning_rate": 9.078548387096774e-05, + "loss": 0.1777, + "step": 6214 + }, + { + "epoch": 0.09944, + "grad_norm": 1.2265625, + "learning_rate": 9.078387096774194e-05, + "loss": 0.2025, + "step": 6215 + }, + { + "epoch": 0.099456, + "grad_norm": 1.265625, + "learning_rate": 9.078225806451614e-05, + "loss": 0.1834, + "step": 6216 + }, + { + "epoch": 0.099472, + "grad_norm": 0.84375, + "learning_rate": 9.078064516129033e-05, + "loss": 0.1881, + "step": 6217 + }, + { + "epoch": 0.099488, + "grad_norm": 0.6953125, + "learning_rate": 9.077903225806453e-05, + "loss": 0.2313, + "step": 6218 + }, + { + "epoch": 0.099504, + "grad_norm": 0.625, + "learning_rate": 9.077741935483871e-05, + "loss": 0.1715, + "step": 6219 + }, + { + "epoch": 0.09952, + "grad_norm": 0.84375, + "learning_rate": 9.077580645161291e-05, + "loss": 0.1843, + "step": 6220 + }, + { + "epoch": 0.099536, + "grad_norm": 0.609375, + "learning_rate": 9.07741935483871e-05, + "loss": 0.1756, + "step": 6221 + }, + { + "epoch": 0.099552, + "grad_norm": 0.80078125, + "learning_rate": 9.07725806451613e-05, + "loss": 0.2074, + "step": 6222 + }, + { + "epoch": 0.099568, + "grad_norm": 0.76171875, + "learning_rate": 9.077096774193548e-05, + "loss": 0.1632, + "step": 6223 + }, + { + "epoch": 0.099584, + "grad_norm": 0.6171875, + "learning_rate": 9.076935483870968e-05, + "loss": 0.1843, + "step": 6224 + }, + { + "epoch": 0.0996, + "grad_norm": 0.9921875, + "learning_rate": 9.076774193548387e-05, + "loss": 0.1902, + "step": 6225 + }, + { + "epoch": 0.099616, + "grad_norm": 0.6015625, + "learning_rate": 9.076612903225807e-05, + "loss": 0.1623, + "step": 6226 + }, + { + "epoch": 0.099632, + "grad_norm": 0.828125, + "learning_rate": 9.076451612903227e-05, + "loss": 0.1925, + "step": 6227 + }, + { + "epoch": 0.099648, + "grad_norm": 1.1171875, + "learning_rate": 9.076290322580645e-05, + "loss": 0.2036, + "step": 6228 + }, + { + "epoch": 0.099664, + "grad_norm": 0.7578125, + "learning_rate": 9.076129032258065e-05, + "loss": 0.1802, + "step": 6229 + }, + { + "epoch": 0.09968, + "grad_norm": 0.71484375, + "learning_rate": 9.075967741935484e-05, + "loss": 0.1944, + "step": 6230 + }, + { + "epoch": 0.099696, + "grad_norm": 0.80859375, + "learning_rate": 9.075806451612904e-05, + "loss": 0.1598, + "step": 6231 + }, + { + "epoch": 0.099712, + "grad_norm": 1.1640625, + "learning_rate": 9.075645161290323e-05, + "loss": 0.1872, + "step": 6232 + }, + { + "epoch": 0.099728, + "grad_norm": 0.95703125, + "learning_rate": 9.075483870967743e-05, + "loss": 0.1816, + "step": 6233 + }, + { + "epoch": 0.099744, + "grad_norm": 0.890625, + "learning_rate": 9.075322580645161e-05, + "loss": 0.1819, + "step": 6234 + }, + { + "epoch": 0.09976, + "grad_norm": 0.9140625, + "learning_rate": 9.075161290322581e-05, + "loss": 0.1913, + "step": 6235 + }, + { + "epoch": 0.099776, + "grad_norm": 0.84375, + "learning_rate": 9.075e-05, + "loss": 0.1747, + "step": 6236 + }, + { + "epoch": 0.099792, + "grad_norm": 0.71484375, + "learning_rate": 9.07483870967742e-05, + "loss": 0.1854, + "step": 6237 + }, + { + "epoch": 0.099808, + "grad_norm": 0.60546875, + "learning_rate": 9.07467741935484e-05, + "loss": 0.139, + "step": 6238 + }, + { + "epoch": 0.099824, + "grad_norm": 0.98046875, + "learning_rate": 9.07451612903226e-05, + "loss": 0.159, + "step": 6239 + }, + { + "epoch": 0.09984, + "grad_norm": 0.85546875, + "learning_rate": 9.074354838709678e-05, + "loss": 0.1879, + "step": 6240 + }, + { + "epoch": 0.099856, + "grad_norm": 0.98046875, + "learning_rate": 9.074193548387098e-05, + "loss": 0.1742, + "step": 6241 + }, + { + "epoch": 0.099872, + "grad_norm": 0.64453125, + "learning_rate": 9.074032258064517e-05, + "loss": 0.1423, + "step": 6242 + }, + { + "epoch": 0.099888, + "grad_norm": 1.3046875, + "learning_rate": 9.073870967741937e-05, + "loss": 0.1764, + "step": 6243 + }, + { + "epoch": 0.099904, + "grad_norm": 1.515625, + "learning_rate": 9.073709677419355e-05, + "loss": 0.1934, + "step": 6244 + }, + { + "epoch": 0.09992, + "grad_norm": 0.73828125, + "learning_rate": 9.073548387096774e-05, + "loss": 0.1502, + "step": 6245 + }, + { + "epoch": 0.099936, + "grad_norm": 0.78515625, + "learning_rate": 9.073387096774194e-05, + "loss": 0.1977, + "step": 6246 + }, + { + "epoch": 0.099952, + "grad_norm": 0.9140625, + "learning_rate": 9.073225806451612e-05, + "loss": 0.1972, + "step": 6247 + }, + { + "epoch": 0.099968, + "grad_norm": 1.0078125, + "learning_rate": 9.073064516129032e-05, + "loss": 0.1658, + "step": 6248 + }, + { + "epoch": 0.099984, + "grad_norm": 0.6171875, + "learning_rate": 9.072903225806451e-05, + "loss": 0.1918, + "step": 6249 + }, + { + "epoch": 0.1, + "grad_norm": 0.67578125, + "learning_rate": 9.072741935483871e-05, + "loss": 0.1581, + "step": 6250 + }, + { + "epoch": 0.100016, + "grad_norm": 0.9609375, + "learning_rate": 9.072580645161291e-05, + "loss": 0.1839, + "step": 6251 + }, + { + "epoch": 0.100032, + "grad_norm": 0.8203125, + "learning_rate": 9.072419354838711e-05, + "loss": 0.1846, + "step": 6252 + }, + { + "epoch": 0.100048, + "grad_norm": 0.7265625, + "learning_rate": 9.07225806451613e-05, + "loss": 0.1594, + "step": 6253 + }, + { + "epoch": 0.100064, + "grad_norm": 0.89453125, + "learning_rate": 9.07209677419355e-05, + "loss": 0.1949, + "step": 6254 + }, + { + "epoch": 0.10008, + "grad_norm": 0.83203125, + "learning_rate": 9.071935483870968e-05, + "loss": 0.1733, + "step": 6255 + }, + { + "epoch": 0.100096, + "grad_norm": 0.87109375, + "learning_rate": 9.071774193548388e-05, + "loss": 0.2304, + "step": 6256 + }, + { + "epoch": 0.100112, + "grad_norm": 0.72265625, + "learning_rate": 9.071612903225807e-05, + "loss": 0.1746, + "step": 6257 + }, + { + "epoch": 0.100128, + "grad_norm": 0.9609375, + "learning_rate": 9.071451612903227e-05, + "loss": 0.2019, + "step": 6258 + }, + { + "epoch": 0.100144, + "grad_norm": 0.8125, + "learning_rate": 9.071290322580645e-05, + "loss": 0.1453, + "step": 6259 + }, + { + "epoch": 0.10016, + "grad_norm": 1.09375, + "learning_rate": 9.071129032258064e-05, + "loss": 0.1726, + "step": 6260 + }, + { + "epoch": 0.100176, + "grad_norm": 0.8359375, + "learning_rate": 9.070967741935484e-05, + "loss": 0.1891, + "step": 6261 + }, + { + "epoch": 0.100192, + "grad_norm": 0.83984375, + "learning_rate": 9.070806451612904e-05, + "loss": 0.1929, + "step": 6262 + }, + { + "epoch": 0.100208, + "grad_norm": 1.65625, + "learning_rate": 9.070645161290324e-05, + "loss": 0.1835, + "step": 6263 + }, + { + "epoch": 0.100224, + "grad_norm": 1.2421875, + "learning_rate": 9.070483870967742e-05, + "loss": 0.1738, + "step": 6264 + }, + { + "epoch": 0.10024, + "grad_norm": 1.125, + "learning_rate": 9.070322580645162e-05, + "loss": 0.177, + "step": 6265 + }, + { + "epoch": 0.100256, + "grad_norm": 0.64453125, + "learning_rate": 9.070161290322581e-05, + "loss": 0.1537, + "step": 6266 + }, + { + "epoch": 0.100272, + "grad_norm": 0.828125, + "learning_rate": 9.070000000000001e-05, + "loss": 0.1847, + "step": 6267 + }, + { + "epoch": 0.100288, + "grad_norm": 0.828125, + "learning_rate": 9.06983870967742e-05, + "loss": 0.209, + "step": 6268 + }, + { + "epoch": 0.100304, + "grad_norm": 0.55859375, + "learning_rate": 9.06967741935484e-05, + "loss": 0.1296, + "step": 6269 + }, + { + "epoch": 0.10032, + "grad_norm": 0.67578125, + "learning_rate": 9.069516129032258e-05, + "loss": 0.1725, + "step": 6270 + }, + { + "epoch": 0.100336, + "grad_norm": 0.5859375, + "learning_rate": 9.069354838709678e-05, + "loss": 0.1351, + "step": 6271 + }, + { + "epoch": 0.100352, + "grad_norm": 1.28125, + "learning_rate": 9.069193548387097e-05, + "loss": 0.1653, + "step": 6272 + }, + { + "epoch": 0.100368, + "grad_norm": 1.8671875, + "learning_rate": 9.069032258064517e-05, + "loss": 0.1844, + "step": 6273 + }, + { + "epoch": 0.100384, + "grad_norm": 1.6640625, + "learning_rate": 9.068870967741935e-05, + "loss": 0.1696, + "step": 6274 + }, + { + "epoch": 0.1004, + "grad_norm": 1.9609375, + "learning_rate": 9.068709677419355e-05, + "loss": 0.1939, + "step": 6275 + }, + { + "epoch": 0.100416, + "grad_norm": 0.68359375, + "learning_rate": 9.068548387096775e-05, + "loss": 0.1934, + "step": 6276 + }, + { + "epoch": 0.100432, + "grad_norm": 1.4921875, + "learning_rate": 9.068387096774194e-05, + "loss": 0.212, + "step": 6277 + }, + { + "epoch": 0.100448, + "grad_norm": 1.046875, + "learning_rate": 9.068225806451614e-05, + "loss": 0.136, + "step": 6278 + }, + { + "epoch": 0.100464, + "grad_norm": 1.0390625, + "learning_rate": 9.068064516129032e-05, + "loss": 0.1982, + "step": 6279 + }, + { + "epoch": 0.10048, + "grad_norm": 0.609375, + "learning_rate": 9.067903225806452e-05, + "loss": 0.1383, + "step": 6280 + }, + { + "epoch": 0.100496, + "grad_norm": 0.9609375, + "learning_rate": 9.067741935483871e-05, + "loss": 0.2333, + "step": 6281 + }, + { + "epoch": 0.100512, + "grad_norm": 0.57421875, + "learning_rate": 9.067580645161291e-05, + "loss": 0.1743, + "step": 6282 + }, + { + "epoch": 0.100528, + "grad_norm": 0.8828125, + "learning_rate": 9.06741935483871e-05, + "loss": 0.189, + "step": 6283 + }, + { + "epoch": 0.100544, + "grad_norm": 0.91015625, + "learning_rate": 9.06725806451613e-05, + "loss": 0.1685, + "step": 6284 + }, + { + "epoch": 0.10056, + "grad_norm": 1.28125, + "learning_rate": 9.067096774193548e-05, + "loss": 0.1837, + "step": 6285 + }, + { + "epoch": 0.100576, + "grad_norm": 0.609375, + "learning_rate": 9.066935483870968e-05, + "loss": 0.1543, + "step": 6286 + }, + { + "epoch": 0.100592, + "grad_norm": 0.92578125, + "learning_rate": 9.066774193548388e-05, + "loss": 0.2155, + "step": 6287 + }, + { + "epoch": 0.100608, + "grad_norm": 1.203125, + "learning_rate": 9.066612903225808e-05, + "loss": 0.2497, + "step": 6288 + }, + { + "epoch": 0.100624, + "grad_norm": 0.76953125, + "learning_rate": 9.066451612903227e-05, + "loss": 0.1523, + "step": 6289 + }, + { + "epoch": 0.10064, + "grad_norm": 0.7890625, + "learning_rate": 9.066290322580647e-05, + "loss": 0.2225, + "step": 6290 + }, + { + "epoch": 0.100656, + "grad_norm": 0.9765625, + "learning_rate": 9.066129032258065e-05, + "loss": 0.1445, + "step": 6291 + }, + { + "epoch": 0.100672, + "grad_norm": 1.125, + "learning_rate": 9.065967741935484e-05, + "loss": 0.1434, + "step": 6292 + }, + { + "epoch": 0.100688, + "grad_norm": 0.7109375, + "learning_rate": 9.065806451612904e-05, + "loss": 0.1761, + "step": 6293 + }, + { + "epoch": 0.100704, + "grad_norm": 0.91796875, + "learning_rate": 9.065645161290322e-05, + "loss": 0.1987, + "step": 6294 + }, + { + "epoch": 0.10072, + "grad_norm": 0.7734375, + "learning_rate": 9.065483870967742e-05, + "loss": 0.2157, + "step": 6295 + }, + { + "epoch": 0.100736, + "grad_norm": 0.64453125, + "learning_rate": 9.065322580645161e-05, + "loss": 0.1783, + "step": 6296 + }, + { + "epoch": 0.100752, + "grad_norm": 1.3125, + "learning_rate": 9.065161290322581e-05, + "loss": 0.1638, + "step": 6297 + }, + { + "epoch": 0.100768, + "grad_norm": 0.80078125, + "learning_rate": 9.065000000000001e-05, + "loss": 0.1512, + "step": 6298 + }, + { + "epoch": 0.100784, + "grad_norm": 1.3984375, + "learning_rate": 9.064838709677421e-05, + "loss": 0.1839, + "step": 6299 + }, + { + "epoch": 0.1008, + "grad_norm": 0.84765625, + "learning_rate": 9.06467741935484e-05, + "loss": 0.1448, + "step": 6300 + }, + { + "epoch": 0.100816, + "grad_norm": 0.93359375, + "learning_rate": 9.06451612903226e-05, + "loss": 0.2027, + "step": 6301 + }, + { + "epoch": 0.100832, + "grad_norm": 0.76171875, + "learning_rate": 9.064354838709678e-05, + "loss": 0.2052, + "step": 6302 + }, + { + "epoch": 0.100848, + "grad_norm": 0.61328125, + "learning_rate": 9.064193548387098e-05, + "loss": 0.1691, + "step": 6303 + }, + { + "epoch": 0.100864, + "grad_norm": 0.859375, + "learning_rate": 9.064032258064517e-05, + "loss": 0.1948, + "step": 6304 + }, + { + "epoch": 0.10088, + "grad_norm": 0.921875, + "learning_rate": 9.063870967741936e-05, + "loss": 0.1578, + "step": 6305 + }, + { + "epoch": 0.100896, + "grad_norm": 0.953125, + "learning_rate": 9.063709677419355e-05, + "loss": 0.1824, + "step": 6306 + }, + { + "epoch": 0.100912, + "grad_norm": 0.99609375, + "learning_rate": 9.063548387096774e-05, + "loss": 0.1575, + "step": 6307 + }, + { + "epoch": 0.100928, + "grad_norm": 0.69921875, + "learning_rate": 9.063387096774194e-05, + "loss": 0.1938, + "step": 6308 + }, + { + "epoch": 0.100944, + "grad_norm": 0.78125, + "learning_rate": 9.063225806451612e-05, + "loss": 0.1888, + "step": 6309 + }, + { + "epoch": 0.10096, + "grad_norm": 1.3671875, + "learning_rate": 9.063064516129032e-05, + "loss": 0.1939, + "step": 6310 + }, + { + "epoch": 0.100976, + "grad_norm": 1.0625, + "learning_rate": 9.062903225806452e-05, + "loss": 0.1642, + "step": 6311 + }, + { + "epoch": 0.100992, + "grad_norm": 0.6796875, + "learning_rate": 9.062741935483872e-05, + "loss": 0.1616, + "step": 6312 + }, + { + "epoch": 0.101008, + "grad_norm": 0.83203125, + "learning_rate": 9.062580645161291e-05, + "loss": 0.1599, + "step": 6313 + }, + { + "epoch": 0.101024, + "grad_norm": 0.59375, + "learning_rate": 9.062419354838711e-05, + "loss": 0.1608, + "step": 6314 + }, + { + "epoch": 0.10104, + "grad_norm": 0.54296875, + "learning_rate": 9.062258064516129e-05, + "loss": 0.1365, + "step": 6315 + }, + { + "epoch": 0.101056, + "grad_norm": 0.76953125, + "learning_rate": 9.062096774193549e-05, + "loss": 0.1824, + "step": 6316 + }, + { + "epoch": 0.101072, + "grad_norm": 0.6640625, + "learning_rate": 9.061935483870968e-05, + "loss": 0.168, + "step": 6317 + }, + { + "epoch": 0.101088, + "grad_norm": 0.81640625, + "learning_rate": 9.061774193548388e-05, + "loss": 0.1781, + "step": 6318 + }, + { + "epoch": 0.101104, + "grad_norm": 1.171875, + "learning_rate": 9.061612903225806e-05, + "loss": 0.1413, + "step": 6319 + }, + { + "epoch": 0.10112, + "grad_norm": 0.90234375, + "learning_rate": 9.061451612903226e-05, + "loss": 0.1932, + "step": 6320 + }, + { + "epoch": 0.101136, + "grad_norm": 0.58203125, + "learning_rate": 9.061290322580645e-05, + "loss": 0.1826, + "step": 6321 + }, + { + "epoch": 0.101152, + "grad_norm": 1.2734375, + "learning_rate": 9.061129032258065e-05, + "loss": 0.1946, + "step": 6322 + }, + { + "epoch": 0.101168, + "grad_norm": 0.8515625, + "learning_rate": 9.060967741935485e-05, + "loss": 0.1808, + "step": 6323 + }, + { + "epoch": 0.101184, + "grad_norm": 0.51953125, + "learning_rate": 9.060806451612904e-05, + "loss": 0.1824, + "step": 6324 + }, + { + "epoch": 0.1012, + "grad_norm": 0.64453125, + "learning_rate": 9.060645161290324e-05, + "loss": 0.1665, + "step": 6325 + }, + { + "epoch": 0.101216, + "grad_norm": 0.703125, + "learning_rate": 9.060483870967742e-05, + "loss": 0.1782, + "step": 6326 + }, + { + "epoch": 0.101232, + "grad_norm": 0.95703125, + "learning_rate": 9.060322580645162e-05, + "loss": 0.1994, + "step": 6327 + }, + { + "epoch": 0.101248, + "grad_norm": 0.63671875, + "learning_rate": 9.060161290322581e-05, + "loss": 0.1995, + "step": 6328 + }, + { + "epoch": 0.101264, + "grad_norm": 0.7890625, + "learning_rate": 9.06e-05, + "loss": 0.1513, + "step": 6329 + }, + { + "epoch": 0.10128, + "grad_norm": 1.0, + "learning_rate": 9.059838709677419e-05, + "loss": 0.2057, + "step": 6330 + }, + { + "epoch": 0.101296, + "grad_norm": 0.71484375, + "learning_rate": 9.059677419354839e-05, + "loss": 0.1905, + "step": 6331 + }, + { + "epoch": 0.101312, + "grad_norm": 0.68359375, + "learning_rate": 9.059516129032258e-05, + "loss": 0.1907, + "step": 6332 + }, + { + "epoch": 0.101328, + "grad_norm": 0.98046875, + "learning_rate": 9.059354838709678e-05, + "loss": 0.1924, + "step": 6333 + }, + { + "epoch": 0.101344, + "grad_norm": 0.7890625, + "learning_rate": 9.059193548387098e-05, + "loss": 0.2158, + "step": 6334 + }, + { + "epoch": 0.10136, + "grad_norm": 0.85546875, + "learning_rate": 9.059032258064516e-05, + "loss": 0.1902, + "step": 6335 + }, + { + "epoch": 0.101376, + "grad_norm": 0.69921875, + "learning_rate": 9.058870967741936e-05, + "loss": 0.1864, + "step": 6336 + }, + { + "epoch": 0.101392, + "grad_norm": 1.0234375, + "learning_rate": 9.058709677419355e-05, + "loss": 0.185, + "step": 6337 + }, + { + "epoch": 0.101408, + "grad_norm": 0.68359375, + "learning_rate": 9.058548387096775e-05, + "loss": 0.1484, + "step": 6338 + }, + { + "epoch": 0.101424, + "grad_norm": 0.68359375, + "learning_rate": 9.058387096774194e-05, + "loss": 0.1821, + "step": 6339 + }, + { + "epoch": 0.10144, + "grad_norm": 1.078125, + "learning_rate": 9.058225806451613e-05, + "loss": 0.1951, + "step": 6340 + }, + { + "epoch": 0.101456, + "grad_norm": 0.6015625, + "learning_rate": 9.058064516129032e-05, + "loss": 0.1399, + "step": 6341 + }, + { + "epoch": 0.101472, + "grad_norm": 0.8203125, + "learning_rate": 9.057903225806452e-05, + "loss": 0.1821, + "step": 6342 + }, + { + "epoch": 0.101488, + "grad_norm": 0.62109375, + "learning_rate": 9.05774193548387e-05, + "loss": 0.1824, + "step": 6343 + }, + { + "epoch": 0.101504, + "grad_norm": 0.80078125, + "learning_rate": 9.05758064516129e-05, + "loss": 0.1884, + "step": 6344 + }, + { + "epoch": 0.10152, + "grad_norm": 1.0703125, + "learning_rate": 9.057419354838709e-05, + "loss": 0.1915, + "step": 6345 + }, + { + "epoch": 0.101536, + "grad_norm": 0.70703125, + "learning_rate": 9.057258064516129e-05, + "loss": 0.1685, + "step": 6346 + }, + { + "epoch": 0.101552, + "grad_norm": 1.0625, + "learning_rate": 9.057096774193549e-05, + "loss": 0.2102, + "step": 6347 + }, + { + "epoch": 0.101568, + "grad_norm": 0.7265625, + "learning_rate": 9.056935483870969e-05, + "loss": 0.143, + "step": 6348 + }, + { + "epoch": 0.101584, + "grad_norm": 1.1171875, + "learning_rate": 9.056774193548388e-05, + "loss": 0.1767, + "step": 6349 + }, + { + "epoch": 0.1016, + "grad_norm": 0.5234375, + "learning_rate": 9.056612903225808e-05, + "loss": 0.1728, + "step": 6350 + }, + { + "epoch": 0.101616, + "grad_norm": 0.78125, + "learning_rate": 9.056451612903226e-05, + "loss": 0.1785, + "step": 6351 + }, + { + "epoch": 0.101632, + "grad_norm": 1.203125, + "learning_rate": 9.056290322580646e-05, + "loss": 0.1591, + "step": 6352 + }, + { + "epoch": 0.101648, + "grad_norm": 1.515625, + "learning_rate": 9.056129032258065e-05, + "loss": 0.1548, + "step": 6353 + }, + { + "epoch": 0.101664, + "grad_norm": 0.8671875, + "learning_rate": 9.055967741935483e-05, + "loss": 0.2116, + "step": 6354 + }, + { + "epoch": 0.10168, + "grad_norm": 0.87890625, + "learning_rate": 9.055806451612903e-05, + "loss": 0.1938, + "step": 6355 + }, + { + "epoch": 0.101696, + "grad_norm": 0.703125, + "learning_rate": 9.055645161290322e-05, + "loss": 0.1948, + "step": 6356 + }, + { + "epoch": 0.101712, + "grad_norm": 0.84765625, + "learning_rate": 9.055483870967742e-05, + "loss": 0.1352, + "step": 6357 + }, + { + "epoch": 0.101728, + "grad_norm": 0.65234375, + "learning_rate": 9.055322580645162e-05, + "loss": 0.1848, + "step": 6358 + }, + { + "epoch": 0.101744, + "grad_norm": 0.765625, + "learning_rate": 9.055161290322582e-05, + "loss": 0.1896, + "step": 6359 + }, + { + "epoch": 0.10176, + "grad_norm": 0.828125, + "learning_rate": 9.055e-05, + "loss": 0.1826, + "step": 6360 + }, + { + "epoch": 0.101776, + "grad_norm": 0.671875, + "learning_rate": 9.05483870967742e-05, + "loss": 0.1637, + "step": 6361 + }, + { + "epoch": 0.101792, + "grad_norm": 1.015625, + "learning_rate": 9.054677419354839e-05, + "loss": 0.1748, + "step": 6362 + }, + { + "epoch": 0.101808, + "grad_norm": 1.265625, + "learning_rate": 9.054516129032259e-05, + "loss": 0.1377, + "step": 6363 + }, + { + "epoch": 0.101824, + "grad_norm": 1.015625, + "learning_rate": 9.054354838709678e-05, + "loss": 0.2019, + "step": 6364 + }, + { + "epoch": 0.10184, + "grad_norm": 0.85546875, + "learning_rate": 9.054193548387098e-05, + "loss": 0.2056, + "step": 6365 + }, + { + "epoch": 0.101856, + "grad_norm": 1.359375, + "learning_rate": 9.054032258064516e-05, + "loss": 0.1835, + "step": 6366 + }, + { + "epoch": 0.101872, + "grad_norm": 0.60546875, + "learning_rate": 9.053870967741936e-05, + "loss": 0.1687, + "step": 6367 + }, + { + "epoch": 0.101888, + "grad_norm": 0.59765625, + "learning_rate": 9.053709677419355e-05, + "loss": 0.166, + "step": 6368 + }, + { + "epoch": 0.101904, + "grad_norm": 0.98046875, + "learning_rate": 9.053548387096773e-05, + "loss": 0.1855, + "step": 6369 + }, + { + "epoch": 0.10192, + "grad_norm": 1.4375, + "learning_rate": 9.053387096774193e-05, + "loss": 0.1609, + "step": 6370 + }, + { + "epoch": 0.101936, + "grad_norm": 0.703125, + "learning_rate": 9.053225806451613e-05, + "loss": 0.1632, + "step": 6371 + }, + { + "epoch": 0.101952, + "grad_norm": 0.7265625, + "learning_rate": 9.053064516129033e-05, + "loss": 0.1947, + "step": 6372 + }, + { + "epoch": 0.101968, + "grad_norm": 0.8984375, + "learning_rate": 9.052903225806452e-05, + "loss": 0.1993, + "step": 6373 + }, + { + "epoch": 0.101984, + "grad_norm": 0.765625, + "learning_rate": 9.052741935483872e-05, + "loss": 0.1802, + "step": 6374 + }, + { + "epoch": 0.102, + "grad_norm": 0.6953125, + "learning_rate": 9.05258064516129e-05, + "loss": 0.1821, + "step": 6375 + }, + { + "epoch": 0.102016, + "grad_norm": 0.5703125, + "learning_rate": 9.05241935483871e-05, + "loss": 0.1335, + "step": 6376 + }, + { + "epoch": 0.102032, + "grad_norm": 0.55078125, + "learning_rate": 9.052258064516129e-05, + "loss": 0.1276, + "step": 6377 + }, + { + "epoch": 0.102048, + "grad_norm": 1.0078125, + "learning_rate": 9.052096774193549e-05, + "loss": 0.2414, + "step": 6378 + }, + { + "epoch": 0.102064, + "grad_norm": 1.0703125, + "learning_rate": 9.051935483870968e-05, + "loss": 0.1538, + "step": 6379 + }, + { + "epoch": 0.10208, + "grad_norm": 0.6640625, + "learning_rate": 9.051774193548388e-05, + "loss": 0.1903, + "step": 6380 + }, + { + "epoch": 0.102096, + "grad_norm": 0.84765625, + "learning_rate": 9.051612903225806e-05, + "loss": 0.1834, + "step": 6381 + }, + { + "epoch": 0.102112, + "grad_norm": 0.54296875, + "learning_rate": 9.051451612903226e-05, + "loss": 0.1636, + "step": 6382 + }, + { + "epoch": 0.102128, + "grad_norm": 0.67578125, + "learning_rate": 9.051290322580646e-05, + "loss": 0.1774, + "step": 6383 + }, + { + "epoch": 0.102144, + "grad_norm": 0.8125, + "learning_rate": 9.051129032258065e-05, + "loss": 0.1815, + "step": 6384 + }, + { + "epoch": 0.10216, + "grad_norm": 0.64453125, + "learning_rate": 9.050967741935485e-05, + "loss": 0.1611, + "step": 6385 + }, + { + "epoch": 0.102176, + "grad_norm": 1.4140625, + "learning_rate": 9.050806451612903e-05, + "loss": 0.1757, + "step": 6386 + }, + { + "epoch": 0.102192, + "grad_norm": 0.59375, + "learning_rate": 9.050645161290323e-05, + "loss": 0.1644, + "step": 6387 + }, + { + "epoch": 0.102208, + "grad_norm": 1.1015625, + "learning_rate": 9.050483870967742e-05, + "loss": 0.1509, + "step": 6388 + }, + { + "epoch": 0.102224, + "grad_norm": 0.67578125, + "learning_rate": 9.050322580645162e-05, + "loss": 0.1438, + "step": 6389 + }, + { + "epoch": 0.10224, + "grad_norm": 0.953125, + "learning_rate": 9.05016129032258e-05, + "loss": 0.2145, + "step": 6390 + }, + { + "epoch": 0.102256, + "grad_norm": 1.1171875, + "learning_rate": 9.05e-05, + "loss": 0.1704, + "step": 6391 + }, + { + "epoch": 0.102272, + "grad_norm": 1.0859375, + "learning_rate": 9.049838709677419e-05, + "loss": 0.1749, + "step": 6392 + }, + { + "epoch": 0.102288, + "grad_norm": 1.7109375, + "learning_rate": 9.049677419354839e-05, + "loss": 0.174, + "step": 6393 + }, + { + "epoch": 0.102304, + "grad_norm": 1.578125, + "learning_rate": 9.049516129032259e-05, + "loss": 0.1701, + "step": 6394 + }, + { + "epoch": 0.10232, + "grad_norm": 0.77734375, + "learning_rate": 9.049354838709679e-05, + "loss": 0.2119, + "step": 6395 + }, + { + "epoch": 0.102336, + "grad_norm": 0.72265625, + "learning_rate": 9.049193548387098e-05, + "loss": 0.2086, + "step": 6396 + }, + { + "epoch": 0.102352, + "grad_norm": 1.03125, + "learning_rate": 9.049032258064517e-05, + "loss": 0.1744, + "step": 6397 + }, + { + "epoch": 0.102368, + "grad_norm": 0.9765625, + "learning_rate": 9.048870967741936e-05, + "loss": 0.1748, + "step": 6398 + }, + { + "epoch": 0.102384, + "grad_norm": 0.71875, + "learning_rate": 9.048709677419356e-05, + "loss": 0.1557, + "step": 6399 + }, + { + "epoch": 0.1024, + "grad_norm": 0.92578125, + "learning_rate": 9.048548387096775e-05, + "loss": 0.1864, + "step": 6400 + }, + { + "epoch": 0.102416, + "grad_norm": 0.8359375, + "learning_rate": 9.048387096774193e-05, + "loss": 0.1745, + "step": 6401 + }, + { + "epoch": 0.102432, + "grad_norm": 0.66015625, + "learning_rate": 9.048225806451613e-05, + "loss": 0.1616, + "step": 6402 + }, + { + "epoch": 0.102448, + "grad_norm": 0.7890625, + "learning_rate": 9.048064516129032e-05, + "loss": 0.2272, + "step": 6403 + }, + { + "epoch": 0.102464, + "grad_norm": 1.109375, + "learning_rate": 9.047903225806452e-05, + "loss": 0.1825, + "step": 6404 + }, + { + "epoch": 0.10248, + "grad_norm": 0.8515625, + "learning_rate": 9.04774193548387e-05, + "loss": 0.1875, + "step": 6405 + }, + { + "epoch": 0.102496, + "grad_norm": 0.98046875, + "learning_rate": 9.04758064516129e-05, + "loss": 0.1871, + "step": 6406 + }, + { + "epoch": 0.102512, + "grad_norm": 0.5703125, + "learning_rate": 9.04741935483871e-05, + "loss": 0.1337, + "step": 6407 + }, + { + "epoch": 0.102528, + "grad_norm": 0.83203125, + "learning_rate": 9.04725806451613e-05, + "loss": 0.2119, + "step": 6408 + }, + { + "epoch": 0.102544, + "grad_norm": 1.3828125, + "learning_rate": 9.047096774193549e-05, + "loss": 0.1951, + "step": 6409 + }, + { + "epoch": 0.10256, + "grad_norm": 0.51953125, + "learning_rate": 9.046935483870969e-05, + "loss": 0.1504, + "step": 6410 + }, + { + "epoch": 0.102576, + "grad_norm": 0.52734375, + "learning_rate": 9.046774193548387e-05, + "loss": 0.1819, + "step": 6411 + }, + { + "epoch": 0.102592, + "grad_norm": 1.21875, + "learning_rate": 9.046612903225807e-05, + "loss": 0.1489, + "step": 6412 + }, + { + "epoch": 0.102608, + "grad_norm": 1.078125, + "learning_rate": 9.046451612903226e-05, + "loss": 0.1895, + "step": 6413 + }, + { + "epoch": 0.102624, + "grad_norm": 0.453125, + "learning_rate": 9.046290322580646e-05, + "loss": 0.1628, + "step": 6414 + }, + { + "epoch": 0.10264, + "grad_norm": 0.58984375, + "learning_rate": 9.046129032258065e-05, + "loss": 0.1272, + "step": 6415 + }, + { + "epoch": 0.102656, + "grad_norm": 1.0, + "learning_rate": 9.045967741935483e-05, + "loss": 0.1563, + "step": 6416 + }, + { + "epoch": 0.102672, + "grad_norm": 0.76953125, + "learning_rate": 9.045806451612903e-05, + "loss": 0.1779, + "step": 6417 + }, + { + "epoch": 0.102688, + "grad_norm": 1.109375, + "learning_rate": 9.045645161290323e-05, + "loss": 0.2201, + "step": 6418 + }, + { + "epoch": 0.102704, + "grad_norm": 0.6953125, + "learning_rate": 9.045483870967743e-05, + "loss": 0.1605, + "step": 6419 + }, + { + "epoch": 0.10272, + "grad_norm": 0.66796875, + "learning_rate": 9.045322580645162e-05, + "loss": 0.1872, + "step": 6420 + }, + { + "epoch": 0.102736, + "grad_norm": 1.1640625, + "learning_rate": 9.045161290322582e-05, + "loss": 0.2092, + "step": 6421 + }, + { + "epoch": 0.102752, + "grad_norm": 0.6328125, + "learning_rate": 9.045e-05, + "loss": 0.1628, + "step": 6422 + }, + { + "epoch": 0.102768, + "grad_norm": 0.82421875, + "learning_rate": 9.04483870967742e-05, + "loss": 0.1638, + "step": 6423 + }, + { + "epoch": 0.102784, + "grad_norm": 2.078125, + "learning_rate": 9.044677419354839e-05, + "loss": 0.2271, + "step": 6424 + }, + { + "epoch": 0.1028, + "grad_norm": 1.5, + "learning_rate": 9.044516129032259e-05, + "loss": 0.1994, + "step": 6425 + }, + { + "epoch": 0.102816, + "grad_norm": 0.7578125, + "learning_rate": 9.044354838709677e-05, + "loss": 0.1299, + "step": 6426 + }, + { + "epoch": 0.102832, + "grad_norm": 0.77734375, + "learning_rate": 9.044193548387097e-05, + "loss": 0.1618, + "step": 6427 + }, + { + "epoch": 0.102848, + "grad_norm": 0.859375, + "learning_rate": 9.044032258064516e-05, + "loss": 0.1578, + "step": 6428 + }, + { + "epoch": 0.102864, + "grad_norm": 0.84375, + "learning_rate": 9.043870967741936e-05, + "loss": 0.2047, + "step": 6429 + }, + { + "epoch": 0.10288, + "grad_norm": 1.0078125, + "learning_rate": 9.043709677419355e-05, + "loss": 0.1392, + "step": 6430 + }, + { + "epoch": 0.102896, + "grad_norm": 0.6328125, + "learning_rate": 9.043548387096775e-05, + "loss": 0.1154, + "step": 6431 + }, + { + "epoch": 0.102912, + "grad_norm": 1.4296875, + "learning_rate": 9.043387096774195e-05, + "loss": 0.2144, + "step": 6432 + }, + { + "epoch": 0.102928, + "grad_norm": 0.875, + "learning_rate": 9.043225806451613e-05, + "loss": 0.1864, + "step": 6433 + }, + { + "epoch": 0.102944, + "grad_norm": 0.8828125, + "learning_rate": 9.043064516129033e-05, + "loss": 0.2193, + "step": 6434 + }, + { + "epoch": 0.10296, + "grad_norm": 0.83203125, + "learning_rate": 9.042903225806452e-05, + "loss": 0.1679, + "step": 6435 + }, + { + "epoch": 0.102976, + "grad_norm": 0.578125, + "learning_rate": 9.042741935483872e-05, + "loss": 0.1583, + "step": 6436 + }, + { + "epoch": 0.102992, + "grad_norm": 0.6484375, + "learning_rate": 9.04258064516129e-05, + "loss": 0.1613, + "step": 6437 + }, + { + "epoch": 0.103008, + "grad_norm": 0.5546875, + "learning_rate": 9.04241935483871e-05, + "loss": 0.1615, + "step": 6438 + }, + { + "epoch": 0.103024, + "grad_norm": 0.765625, + "learning_rate": 9.042258064516129e-05, + "loss": 0.1833, + "step": 6439 + }, + { + "epoch": 0.10304, + "grad_norm": 1.0390625, + "learning_rate": 9.042096774193549e-05, + "loss": 0.248, + "step": 6440 + }, + { + "epoch": 0.103056, + "grad_norm": 0.8359375, + "learning_rate": 9.041935483870967e-05, + "loss": 0.1786, + "step": 6441 + }, + { + "epoch": 0.103072, + "grad_norm": 0.9765625, + "learning_rate": 9.041774193548387e-05, + "loss": 0.1761, + "step": 6442 + }, + { + "epoch": 0.103088, + "grad_norm": 1.1171875, + "learning_rate": 9.041612903225807e-05, + "loss": 0.1687, + "step": 6443 + }, + { + "epoch": 0.103104, + "grad_norm": 1.171875, + "learning_rate": 9.041451612903227e-05, + "loss": 0.1616, + "step": 6444 + }, + { + "epoch": 0.10312, + "grad_norm": 1.1953125, + "learning_rate": 9.041290322580646e-05, + "loss": 0.1949, + "step": 6445 + }, + { + "epoch": 0.103136, + "grad_norm": 1.4921875, + "learning_rate": 9.041129032258065e-05, + "loss": 0.2253, + "step": 6446 + }, + { + "epoch": 0.103152, + "grad_norm": 0.734375, + "learning_rate": 9.040967741935484e-05, + "loss": 0.1557, + "step": 6447 + }, + { + "epoch": 0.103168, + "grad_norm": 1.140625, + "learning_rate": 9.040806451612903e-05, + "loss": 0.1712, + "step": 6448 + }, + { + "epoch": 0.103184, + "grad_norm": 0.79296875, + "learning_rate": 9.040645161290323e-05, + "loss": 0.2208, + "step": 6449 + }, + { + "epoch": 0.1032, + "grad_norm": 0.9609375, + "learning_rate": 9.040483870967742e-05, + "loss": 0.2017, + "step": 6450 + }, + { + "epoch": 0.103216, + "grad_norm": 1.53125, + "learning_rate": 9.040322580645162e-05, + "loss": 0.1994, + "step": 6451 + }, + { + "epoch": 0.103232, + "grad_norm": 0.8046875, + "learning_rate": 9.04016129032258e-05, + "loss": 0.167, + "step": 6452 + }, + { + "epoch": 0.103248, + "grad_norm": 0.98828125, + "learning_rate": 9.04e-05, + "loss": 0.1705, + "step": 6453 + }, + { + "epoch": 0.103264, + "grad_norm": 0.65625, + "learning_rate": 9.03983870967742e-05, + "loss": 0.1895, + "step": 6454 + }, + { + "epoch": 0.10328, + "grad_norm": 1.171875, + "learning_rate": 9.03967741935484e-05, + "loss": 0.2033, + "step": 6455 + }, + { + "epoch": 0.103296, + "grad_norm": 0.53515625, + "learning_rate": 9.039516129032259e-05, + "loss": 0.1558, + "step": 6456 + }, + { + "epoch": 0.103312, + "grad_norm": 0.8046875, + "learning_rate": 9.039354838709679e-05, + "loss": 0.1577, + "step": 6457 + }, + { + "epoch": 0.103328, + "grad_norm": 0.734375, + "learning_rate": 9.039193548387097e-05, + "loss": 0.1678, + "step": 6458 + }, + { + "epoch": 0.103344, + "grad_norm": 1.4921875, + "learning_rate": 9.039032258064517e-05, + "loss": 0.1946, + "step": 6459 + }, + { + "epoch": 0.10336, + "grad_norm": 0.66796875, + "learning_rate": 9.038870967741936e-05, + "loss": 0.1663, + "step": 6460 + }, + { + "epoch": 0.103376, + "grad_norm": 1.1796875, + "learning_rate": 9.038709677419356e-05, + "loss": 0.1743, + "step": 6461 + }, + { + "epoch": 0.103392, + "grad_norm": 0.5859375, + "learning_rate": 9.038548387096774e-05, + "loss": 0.1537, + "step": 6462 + }, + { + "epoch": 0.103408, + "grad_norm": 0.703125, + "learning_rate": 9.038387096774193e-05, + "loss": 0.1276, + "step": 6463 + }, + { + "epoch": 0.103424, + "grad_norm": 0.64453125, + "learning_rate": 9.038225806451613e-05, + "loss": 0.1838, + "step": 6464 + }, + { + "epoch": 0.10344, + "grad_norm": 0.91015625, + "learning_rate": 9.038064516129032e-05, + "loss": 0.1514, + "step": 6465 + }, + { + "epoch": 0.103456, + "grad_norm": 0.66796875, + "learning_rate": 9.037903225806452e-05, + "loss": 0.181, + "step": 6466 + }, + { + "epoch": 0.103472, + "grad_norm": 0.5625, + "learning_rate": 9.037741935483872e-05, + "loss": 0.1669, + "step": 6467 + }, + { + "epoch": 0.103488, + "grad_norm": 0.5625, + "learning_rate": 9.037580645161291e-05, + "loss": 0.1712, + "step": 6468 + }, + { + "epoch": 0.103504, + "grad_norm": 0.72265625, + "learning_rate": 9.03741935483871e-05, + "loss": 0.1557, + "step": 6469 + }, + { + "epoch": 0.10352, + "grad_norm": 0.5546875, + "learning_rate": 9.03725806451613e-05, + "loss": 0.1651, + "step": 6470 + }, + { + "epoch": 0.103536, + "grad_norm": 0.71484375, + "learning_rate": 9.037096774193549e-05, + "loss": 0.2017, + "step": 6471 + }, + { + "epoch": 0.103552, + "grad_norm": 1.1171875, + "learning_rate": 9.036935483870969e-05, + "loss": 0.1431, + "step": 6472 + }, + { + "epoch": 0.103568, + "grad_norm": 0.84765625, + "learning_rate": 9.036774193548387e-05, + "loss": 0.1729, + "step": 6473 + }, + { + "epoch": 0.103584, + "grad_norm": 0.6328125, + "learning_rate": 9.036612903225807e-05, + "loss": 0.1584, + "step": 6474 + }, + { + "epoch": 0.1036, + "grad_norm": 0.8984375, + "learning_rate": 9.036451612903226e-05, + "loss": 0.1673, + "step": 6475 + }, + { + "epoch": 0.103616, + "grad_norm": 0.6484375, + "learning_rate": 9.036290322580646e-05, + "loss": 0.1356, + "step": 6476 + }, + { + "epoch": 0.103632, + "grad_norm": 0.8359375, + "learning_rate": 9.036129032258064e-05, + "loss": 0.1529, + "step": 6477 + }, + { + "epoch": 0.103648, + "grad_norm": 0.76171875, + "learning_rate": 9.035967741935484e-05, + "loss": 0.2063, + "step": 6478 + }, + { + "epoch": 0.103664, + "grad_norm": 0.828125, + "learning_rate": 9.035806451612904e-05, + "loss": 0.1962, + "step": 6479 + }, + { + "epoch": 0.10368, + "grad_norm": 1.8125, + "learning_rate": 9.035645161290323e-05, + "loss": 0.2151, + "step": 6480 + }, + { + "epoch": 0.103696, + "grad_norm": 0.67578125, + "learning_rate": 9.035483870967743e-05, + "loss": 0.1781, + "step": 6481 + }, + { + "epoch": 0.103712, + "grad_norm": 1.25, + "learning_rate": 9.035322580645161e-05, + "loss": 0.1548, + "step": 6482 + }, + { + "epoch": 0.103728, + "grad_norm": 1.0625, + "learning_rate": 9.035161290322581e-05, + "loss": 0.16, + "step": 6483 + }, + { + "epoch": 0.103744, + "grad_norm": 1.3203125, + "learning_rate": 9.035e-05, + "loss": 0.1416, + "step": 6484 + }, + { + "epoch": 0.10376, + "grad_norm": 1.21875, + "learning_rate": 9.03483870967742e-05, + "loss": 0.1565, + "step": 6485 + }, + { + "epoch": 0.103776, + "grad_norm": 1.03125, + "learning_rate": 9.034677419354839e-05, + "loss": 0.2127, + "step": 6486 + }, + { + "epoch": 0.103792, + "grad_norm": 0.9296875, + "learning_rate": 9.034516129032259e-05, + "loss": 0.1821, + "step": 6487 + }, + { + "epoch": 0.103808, + "grad_norm": 1.3515625, + "learning_rate": 9.034354838709677e-05, + "loss": 0.2078, + "step": 6488 + }, + { + "epoch": 0.103824, + "grad_norm": 1.2890625, + "learning_rate": 9.034193548387097e-05, + "loss": 0.1985, + "step": 6489 + }, + { + "epoch": 0.10384, + "grad_norm": 1.7890625, + "learning_rate": 9.034032258064517e-05, + "loss": 0.2126, + "step": 6490 + }, + { + "epoch": 0.103856, + "grad_norm": 0.90625, + "learning_rate": 9.033870967741937e-05, + "loss": 0.1512, + "step": 6491 + }, + { + "epoch": 0.103872, + "grad_norm": 1.1171875, + "learning_rate": 9.033709677419356e-05, + "loss": 0.1615, + "step": 6492 + }, + { + "epoch": 0.103888, + "grad_norm": 1.3671875, + "learning_rate": 9.033548387096774e-05, + "loss": 0.1716, + "step": 6493 + }, + { + "epoch": 0.103904, + "grad_norm": 1.703125, + "learning_rate": 9.033387096774194e-05, + "loss": 0.1962, + "step": 6494 + }, + { + "epoch": 0.10392, + "grad_norm": 0.94921875, + "learning_rate": 9.033225806451613e-05, + "loss": 0.1775, + "step": 6495 + }, + { + "epoch": 0.103936, + "grad_norm": 0.85546875, + "learning_rate": 9.033064516129033e-05, + "loss": 0.1833, + "step": 6496 + }, + { + "epoch": 0.103952, + "grad_norm": 0.89453125, + "learning_rate": 9.032903225806451e-05, + "loss": 0.2017, + "step": 6497 + }, + { + "epoch": 0.103968, + "grad_norm": 0.5390625, + "learning_rate": 9.032741935483871e-05, + "loss": 0.1326, + "step": 6498 + }, + { + "epoch": 0.103984, + "grad_norm": 0.58203125, + "learning_rate": 9.03258064516129e-05, + "loss": 0.1447, + "step": 6499 + }, + { + "epoch": 0.104, + "grad_norm": 1.0859375, + "learning_rate": 9.03241935483871e-05, + "loss": 0.1618, + "step": 6500 + }, + { + "epoch": 0.104016, + "grad_norm": 0.9296875, + "learning_rate": 9.032258064516129e-05, + "loss": 0.1587, + "step": 6501 + }, + { + "epoch": 0.104032, + "grad_norm": 1.7109375, + "learning_rate": 9.032096774193549e-05, + "loss": 0.1936, + "step": 6502 + }, + { + "epoch": 0.104048, + "grad_norm": 1.0703125, + "learning_rate": 9.031935483870969e-05, + "loss": 0.1676, + "step": 6503 + }, + { + "epoch": 0.104064, + "grad_norm": 0.53125, + "learning_rate": 9.031774193548388e-05, + "loss": 0.157, + "step": 6504 + }, + { + "epoch": 0.10408, + "grad_norm": 0.78515625, + "learning_rate": 9.031612903225807e-05, + "loss": 0.1966, + "step": 6505 + }, + { + "epoch": 0.104096, + "grad_norm": 0.5078125, + "learning_rate": 9.031451612903227e-05, + "loss": 0.158, + "step": 6506 + }, + { + "epoch": 0.104112, + "grad_norm": 0.6328125, + "learning_rate": 9.031290322580646e-05, + "loss": 0.188, + "step": 6507 + }, + { + "epoch": 0.104128, + "grad_norm": 1.0703125, + "learning_rate": 9.031129032258066e-05, + "loss": 0.1675, + "step": 6508 + }, + { + "epoch": 0.104144, + "grad_norm": 0.64453125, + "learning_rate": 9.030967741935484e-05, + "loss": 0.1396, + "step": 6509 + }, + { + "epoch": 0.10416, + "grad_norm": 0.65625, + "learning_rate": 9.030806451612903e-05, + "loss": 0.1458, + "step": 6510 + }, + { + "epoch": 0.104176, + "grad_norm": 0.7421875, + "learning_rate": 9.030645161290323e-05, + "loss": 0.2115, + "step": 6511 + }, + { + "epoch": 0.104192, + "grad_norm": 0.72265625, + "learning_rate": 9.030483870967741e-05, + "loss": 0.1678, + "step": 6512 + }, + { + "epoch": 0.104208, + "grad_norm": 0.81640625, + "learning_rate": 9.030322580645161e-05, + "loss": 0.1594, + "step": 6513 + }, + { + "epoch": 0.104224, + "grad_norm": 1.0, + "learning_rate": 9.030161290322581e-05, + "loss": 0.1484, + "step": 6514 + }, + { + "epoch": 0.10424, + "grad_norm": 0.61328125, + "learning_rate": 9.030000000000001e-05, + "loss": 0.1668, + "step": 6515 + }, + { + "epoch": 0.104256, + "grad_norm": 0.63671875, + "learning_rate": 9.02983870967742e-05, + "loss": 0.2031, + "step": 6516 + }, + { + "epoch": 0.104272, + "grad_norm": 0.640625, + "learning_rate": 9.02967741935484e-05, + "loss": 0.2165, + "step": 6517 + }, + { + "epoch": 0.104288, + "grad_norm": 0.71484375, + "learning_rate": 9.029516129032258e-05, + "loss": 0.1914, + "step": 6518 + }, + { + "epoch": 0.104304, + "grad_norm": 0.65234375, + "learning_rate": 9.029354838709678e-05, + "loss": 0.224, + "step": 6519 + }, + { + "epoch": 0.10432, + "grad_norm": 0.8515625, + "learning_rate": 9.029193548387097e-05, + "loss": 0.215, + "step": 6520 + }, + { + "epoch": 0.104336, + "grad_norm": 0.73046875, + "learning_rate": 9.029032258064517e-05, + "loss": 0.1678, + "step": 6521 + }, + { + "epoch": 0.104352, + "grad_norm": 1.015625, + "learning_rate": 9.028870967741936e-05, + "loss": 0.1613, + "step": 6522 + }, + { + "epoch": 0.104368, + "grad_norm": 0.796875, + "learning_rate": 9.028709677419356e-05, + "loss": 0.1624, + "step": 6523 + }, + { + "epoch": 0.104384, + "grad_norm": 0.8515625, + "learning_rate": 9.028548387096774e-05, + "loss": 0.2028, + "step": 6524 + }, + { + "epoch": 0.1044, + "grad_norm": 1.0234375, + "learning_rate": 9.028387096774194e-05, + "loss": 0.1731, + "step": 6525 + }, + { + "epoch": 0.104416, + "grad_norm": 0.59765625, + "learning_rate": 9.028225806451613e-05, + "loss": 0.1639, + "step": 6526 + }, + { + "epoch": 0.104432, + "grad_norm": 0.55859375, + "learning_rate": 9.028064516129033e-05, + "loss": 0.1467, + "step": 6527 + }, + { + "epoch": 0.104448, + "grad_norm": 1.15625, + "learning_rate": 9.027903225806453e-05, + "loss": 0.1909, + "step": 6528 + }, + { + "epoch": 0.104464, + "grad_norm": 0.8671875, + "learning_rate": 9.027741935483871e-05, + "loss": 0.1754, + "step": 6529 + }, + { + "epoch": 0.10448, + "grad_norm": 0.828125, + "learning_rate": 9.027580645161291e-05, + "loss": 0.2116, + "step": 6530 + }, + { + "epoch": 0.104496, + "grad_norm": 0.75, + "learning_rate": 9.02741935483871e-05, + "loss": 0.187, + "step": 6531 + }, + { + "epoch": 0.104512, + "grad_norm": 0.8671875, + "learning_rate": 9.02725806451613e-05, + "loss": 0.1784, + "step": 6532 + }, + { + "epoch": 0.104528, + "grad_norm": 1.078125, + "learning_rate": 9.027096774193548e-05, + "loss": 0.18, + "step": 6533 + }, + { + "epoch": 0.104544, + "grad_norm": 0.73046875, + "learning_rate": 9.026935483870968e-05, + "loss": 0.1471, + "step": 6534 + }, + { + "epoch": 0.10456, + "grad_norm": 1.1328125, + "learning_rate": 9.026774193548387e-05, + "loss": 0.1951, + "step": 6535 + }, + { + "epoch": 0.104576, + "grad_norm": 0.48828125, + "learning_rate": 9.026612903225807e-05, + "loss": 0.1363, + "step": 6536 + }, + { + "epoch": 0.104592, + "grad_norm": 0.83203125, + "learning_rate": 9.026451612903226e-05, + "loss": 0.1957, + "step": 6537 + }, + { + "epoch": 0.104608, + "grad_norm": 0.625, + "learning_rate": 9.026290322580646e-05, + "loss": 0.1635, + "step": 6538 + }, + { + "epoch": 0.104624, + "grad_norm": 0.62109375, + "learning_rate": 9.026129032258065e-05, + "loss": 0.1499, + "step": 6539 + }, + { + "epoch": 0.10464, + "grad_norm": 1.40625, + "learning_rate": 9.025967741935484e-05, + "loss": 0.2084, + "step": 6540 + }, + { + "epoch": 0.104656, + "grad_norm": 1.21875, + "learning_rate": 9.025806451612904e-05, + "loss": 0.193, + "step": 6541 + }, + { + "epoch": 0.104672, + "grad_norm": 0.70703125, + "learning_rate": 9.025645161290323e-05, + "loss": 0.1922, + "step": 6542 + }, + { + "epoch": 0.104688, + "grad_norm": 0.67578125, + "learning_rate": 9.025483870967743e-05, + "loss": 0.1833, + "step": 6543 + }, + { + "epoch": 0.104704, + "grad_norm": 0.83984375, + "learning_rate": 9.025322580645161e-05, + "loss": 0.1757, + "step": 6544 + }, + { + "epoch": 0.10472, + "grad_norm": 0.90234375, + "learning_rate": 9.025161290322581e-05, + "loss": 0.2048, + "step": 6545 + }, + { + "epoch": 0.104736, + "grad_norm": 0.83984375, + "learning_rate": 9.025e-05, + "loss": 0.1928, + "step": 6546 + }, + { + "epoch": 0.104752, + "grad_norm": 0.93359375, + "learning_rate": 9.02483870967742e-05, + "loss": 0.2162, + "step": 6547 + }, + { + "epoch": 0.104768, + "grad_norm": 1.578125, + "learning_rate": 9.024677419354838e-05, + "loss": 0.1771, + "step": 6548 + }, + { + "epoch": 0.104784, + "grad_norm": 0.55078125, + "learning_rate": 9.024516129032258e-05, + "loss": 0.1721, + "step": 6549 + }, + { + "epoch": 0.1048, + "grad_norm": 0.73828125, + "learning_rate": 9.024354838709678e-05, + "loss": 0.1983, + "step": 6550 + }, + { + "epoch": 0.104816, + "grad_norm": 1.0859375, + "learning_rate": 9.024193548387098e-05, + "loss": 0.2093, + "step": 6551 + }, + { + "epoch": 0.104832, + "grad_norm": 0.77734375, + "learning_rate": 9.024032258064517e-05, + "loss": 0.188, + "step": 6552 + }, + { + "epoch": 0.104848, + "grad_norm": 0.73046875, + "learning_rate": 9.023870967741937e-05, + "loss": 0.1996, + "step": 6553 + }, + { + "epoch": 0.104864, + "grad_norm": 0.8671875, + "learning_rate": 9.023709677419355e-05, + "loss": 0.2192, + "step": 6554 + }, + { + "epoch": 0.10488, + "grad_norm": 0.84375, + "learning_rate": 9.023548387096774e-05, + "loss": 0.1839, + "step": 6555 + }, + { + "epoch": 0.104896, + "grad_norm": 0.53515625, + "learning_rate": 9.023387096774194e-05, + "loss": 0.1577, + "step": 6556 + }, + { + "epoch": 0.104912, + "grad_norm": 0.7734375, + "learning_rate": 9.023225806451613e-05, + "loss": 0.1896, + "step": 6557 + }, + { + "epoch": 0.104928, + "grad_norm": 0.71484375, + "learning_rate": 9.023064516129033e-05, + "loss": 0.1909, + "step": 6558 + }, + { + "epoch": 0.104944, + "grad_norm": 0.6015625, + "learning_rate": 9.022903225806451e-05, + "loss": 0.1838, + "step": 6559 + }, + { + "epoch": 0.10496, + "grad_norm": 0.69921875, + "learning_rate": 9.022741935483871e-05, + "loss": 0.1993, + "step": 6560 + }, + { + "epoch": 0.104976, + "grad_norm": 0.91015625, + "learning_rate": 9.02258064516129e-05, + "loss": 0.1594, + "step": 6561 + }, + { + "epoch": 0.104992, + "grad_norm": 0.64453125, + "learning_rate": 9.02241935483871e-05, + "loss": 0.1862, + "step": 6562 + }, + { + "epoch": 0.105008, + "grad_norm": 0.640625, + "learning_rate": 9.02225806451613e-05, + "loss": 0.1941, + "step": 6563 + }, + { + "epoch": 0.105024, + "grad_norm": 0.75390625, + "learning_rate": 9.02209677419355e-05, + "loss": 0.1428, + "step": 6564 + }, + { + "epoch": 0.10504, + "grad_norm": 0.9140625, + "learning_rate": 9.021935483870968e-05, + "loss": 0.2243, + "step": 6565 + }, + { + "epoch": 0.105056, + "grad_norm": 1.0546875, + "learning_rate": 9.021774193548388e-05, + "loss": 0.2169, + "step": 6566 + }, + { + "epoch": 0.105072, + "grad_norm": 0.78125, + "learning_rate": 9.021612903225807e-05, + "loss": 0.1591, + "step": 6567 + }, + { + "epoch": 0.105088, + "grad_norm": 0.6015625, + "learning_rate": 9.021451612903227e-05, + "loss": 0.1722, + "step": 6568 + }, + { + "epoch": 0.105104, + "grad_norm": 0.61328125, + "learning_rate": 9.021290322580645e-05, + "loss": 0.1828, + "step": 6569 + }, + { + "epoch": 0.10512, + "grad_norm": 1.5703125, + "learning_rate": 9.021129032258065e-05, + "loss": 0.1795, + "step": 6570 + }, + { + "epoch": 0.105136, + "grad_norm": 0.86328125, + "learning_rate": 9.020967741935484e-05, + "loss": 0.1851, + "step": 6571 + }, + { + "epoch": 0.105152, + "grad_norm": 0.6875, + "learning_rate": 9.020806451612903e-05, + "loss": 0.164, + "step": 6572 + }, + { + "epoch": 0.105168, + "grad_norm": 1.3203125, + "learning_rate": 9.020645161290323e-05, + "loss": 0.1942, + "step": 6573 + }, + { + "epoch": 0.105184, + "grad_norm": 0.65625, + "learning_rate": 9.020483870967743e-05, + "loss": 0.1662, + "step": 6574 + }, + { + "epoch": 0.1052, + "grad_norm": 0.9296875, + "learning_rate": 9.020322580645162e-05, + "loss": 0.1676, + "step": 6575 + }, + { + "epoch": 0.105216, + "grad_norm": 0.86328125, + "learning_rate": 9.020161290322581e-05, + "loss": 0.1522, + "step": 6576 + }, + { + "epoch": 0.105232, + "grad_norm": 0.74609375, + "learning_rate": 9.020000000000001e-05, + "loss": 0.1837, + "step": 6577 + }, + { + "epoch": 0.105248, + "grad_norm": 0.734375, + "learning_rate": 9.01983870967742e-05, + "loss": 0.165, + "step": 6578 + }, + { + "epoch": 0.105264, + "grad_norm": 1.015625, + "learning_rate": 9.01967741935484e-05, + "loss": 0.2135, + "step": 6579 + }, + { + "epoch": 0.10528, + "grad_norm": 0.93359375, + "learning_rate": 9.019516129032258e-05, + "loss": 0.1963, + "step": 6580 + }, + { + "epoch": 0.105296, + "grad_norm": 1.59375, + "learning_rate": 9.019354838709678e-05, + "loss": 0.187, + "step": 6581 + }, + { + "epoch": 0.105312, + "grad_norm": 0.79296875, + "learning_rate": 9.019193548387097e-05, + "loss": 0.1833, + "step": 6582 + }, + { + "epoch": 0.105328, + "grad_norm": 1.015625, + "learning_rate": 9.019032258064517e-05, + "loss": 0.158, + "step": 6583 + }, + { + "epoch": 0.105344, + "grad_norm": 1.0078125, + "learning_rate": 9.018870967741935e-05, + "loss": 0.1655, + "step": 6584 + }, + { + "epoch": 0.10536, + "grad_norm": 0.87109375, + "learning_rate": 9.018709677419355e-05, + "loss": 0.2092, + "step": 6585 + }, + { + "epoch": 0.105376, + "grad_norm": 0.9609375, + "learning_rate": 9.018548387096775e-05, + "loss": 0.1966, + "step": 6586 + }, + { + "epoch": 0.105392, + "grad_norm": 0.796875, + "learning_rate": 9.018387096774194e-05, + "loss": 0.195, + "step": 6587 + }, + { + "epoch": 0.105408, + "grad_norm": 0.58984375, + "learning_rate": 9.018225806451614e-05, + "loss": 0.1791, + "step": 6588 + }, + { + "epoch": 0.105424, + "grad_norm": 1.171875, + "learning_rate": 9.018064516129032e-05, + "loss": 0.1927, + "step": 6589 + }, + { + "epoch": 0.10544, + "grad_norm": 0.80078125, + "learning_rate": 9.017903225806452e-05, + "loss": 0.1721, + "step": 6590 + }, + { + "epoch": 0.105456, + "grad_norm": 1.0234375, + "learning_rate": 9.017741935483871e-05, + "loss": 0.1715, + "step": 6591 + }, + { + "epoch": 0.105472, + "grad_norm": 0.87890625, + "learning_rate": 9.017580645161291e-05, + "loss": 0.1618, + "step": 6592 + }, + { + "epoch": 0.105488, + "grad_norm": 0.765625, + "learning_rate": 9.01741935483871e-05, + "loss": 0.1905, + "step": 6593 + }, + { + "epoch": 0.105504, + "grad_norm": 0.609375, + "learning_rate": 9.01725806451613e-05, + "loss": 0.1777, + "step": 6594 + }, + { + "epoch": 0.10552, + "grad_norm": 0.74609375, + "learning_rate": 9.017096774193548e-05, + "loss": 0.1676, + "step": 6595 + }, + { + "epoch": 0.105536, + "grad_norm": 0.703125, + "learning_rate": 9.016935483870968e-05, + "loss": 0.1726, + "step": 6596 + }, + { + "epoch": 0.105552, + "grad_norm": 0.765625, + "learning_rate": 9.016774193548387e-05, + "loss": 0.1781, + "step": 6597 + }, + { + "epoch": 0.105568, + "grad_norm": 0.69140625, + "learning_rate": 9.016612903225807e-05, + "loss": 0.1285, + "step": 6598 + }, + { + "epoch": 0.105584, + "grad_norm": 0.625, + "learning_rate": 9.016451612903227e-05, + "loss": 0.1476, + "step": 6599 + }, + { + "epoch": 0.1056, + "grad_norm": 0.68359375, + "learning_rate": 9.016290322580647e-05, + "loss": 0.2011, + "step": 6600 + }, + { + "epoch": 0.105616, + "grad_norm": 0.78125, + "learning_rate": 9.016129032258065e-05, + "loss": 0.1509, + "step": 6601 + }, + { + "epoch": 0.105632, + "grad_norm": 0.46484375, + "learning_rate": 9.015967741935484e-05, + "loss": 0.1457, + "step": 6602 + }, + { + "epoch": 0.105648, + "grad_norm": 0.671875, + "learning_rate": 9.015806451612904e-05, + "loss": 0.1991, + "step": 6603 + }, + { + "epoch": 0.105664, + "grad_norm": 0.89453125, + "learning_rate": 9.015645161290322e-05, + "loss": 0.1649, + "step": 6604 + }, + { + "epoch": 0.10568, + "grad_norm": 0.7890625, + "learning_rate": 9.015483870967742e-05, + "loss": 0.2096, + "step": 6605 + }, + { + "epoch": 0.105696, + "grad_norm": 0.58203125, + "learning_rate": 9.015322580645161e-05, + "loss": 0.138, + "step": 6606 + }, + { + "epoch": 0.105712, + "grad_norm": 0.71875, + "learning_rate": 9.015161290322581e-05, + "loss": 0.1855, + "step": 6607 + }, + { + "epoch": 0.105728, + "grad_norm": 0.69140625, + "learning_rate": 9.015e-05, + "loss": 0.1578, + "step": 6608 + }, + { + "epoch": 0.105744, + "grad_norm": 1.3125, + "learning_rate": 9.01483870967742e-05, + "loss": 0.1628, + "step": 6609 + }, + { + "epoch": 0.10576, + "grad_norm": 0.96875, + "learning_rate": 9.01467741935484e-05, + "loss": 0.1829, + "step": 6610 + }, + { + "epoch": 0.105776, + "grad_norm": 0.6953125, + "learning_rate": 9.01451612903226e-05, + "loss": 0.2018, + "step": 6611 + }, + { + "epoch": 0.105792, + "grad_norm": 0.875, + "learning_rate": 9.014354838709678e-05, + "loss": 0.2042, + "step": 6612 + }, + { + "epoch": 0.105808, + "grad_norm": 1.25, + "learning_rate": 9.014193548387098e-05, + "loss": 0.1887, + "step": 6613 + }, + { + "epoch": 0.105824, + "grad_norm": 0.734375, + "learning_rate": 9.014032258064517e-05, + "loss": 0.1892, + "step": 6614 + }, + { + "epoch": 0.10584, + "grad_norm": 1.671875, + "learning_rate": 9.013870967741937e-05, + "loss": 0.1933, + "step": 6615 + }, + { + "epoch": 0.105856, + "grad_norm": 1.0, + "learning_rate": 9.013709677419355e-05, + "loss": 0.1484, + "step": 6616 + }, + { + "epoch": 0.105872, + "grad_norm": 0.74609375, + "learning_rate": 9.013548387096774e-05, + "loss": 0.1474, + "step": 6617 + }, + { + "epoch": 0.105888, + "grad_norm": 0.91796875, + "learning_rate": 9.013387096774194e-05, + "loss": 0.1654, + "step": 6618 + }, + { + "epoch": 0.105904, + "grad_norm": 1.359375, + "learning_rate": 9.013225806451612e-05, + "loss": 0.1805, + "step": 6619 + }, + { + "epoch": 0.10592, + "grad_norm": 0.7734375, + "learning_rate": 9.013064516129032e-05, + "loss": 0.1626, + "step": 6620 + }, + { + "epoch": 0.105936, + "grad_norm": 0.59765625, + "learning_rate": 9.012903225806451e-05, + "loss": 0.1679, + "step": 6621 + }, + { + "epoch": 0.105952, + "grad_norm": 0.73828125, + "learning_rate": 9.012741935483871e-05, + "loss": 0.1991, + "step": 6622 + }, + { + "epoch": 0.105968, + "grad_norm": 0.88671875, + "learning_rate": 9.012580645161291e-05, + "loss": 0.1707, + "step": 6623 + }, + { + "epoch": 0.105984, + "grad_norm": 1.0859375, + "learning_rate": 9.012419354838711e-05, + "loss": 0.219, + "step": 6624 + }, + { + "epoch": 0.106, + "grad_norm": 1.5078125, + "learning_rate": 9.01225806451613e-05, + "loss": 0.2014, + "step": 6625 + }, + { + "epoch": 0.106016, + "grad_norm": 0.57421875, + "learning_rate": 9.01209677419355e-05, + "loss": 0.1529, + "step": 6626 + }, + { + "epoch": 0.106032, + "grad_norm": 0.88671875, + "learning_rate": 9.011935483870968e-05, + "loss": 0.1957, + "step": 6627 + }, + { + "epoch": 0.106048, + "grad_norm": 1.1171875, + "learning_rate": 9.011774193548388e-05, + "loss": 0.188, + "step": 6628 + }, + { + "epoch": 0.106064, + "grad_norm": 0.953125, + "learning_rate": 9.011612903225807e-05, + "loss": 0.1728, + "step": 6629 + }, + { + "epoch": 0.10608, + "grad_norm": 0.78125, + "learning_rate": 9.011451612903227e-05, + "loss": 0.2003, + "step": 6630 + }, + { + "epoch": 0.106096, + "grad_norm": 0.6015625, + "learning_rate": 9.011290322580645e-05, + "loss": 0.184, + "step": 6631 + }, + { + "epoch": 0.106112, + "grad_norm": 0.65625, + "learning_rate": 9.011129032258065e-05, + "loss": 0.1578, + "step": 6632 + }, + { + "epoch": 0.106128, + "grad_norm": 0.8046875, + "learning_rate": 9.010967741935484e-05, + "loss": 0.2216, + "step": 6633 + }, + { + "epoch": 0.106144, + "grad_norm": 0.8671875, + "learning_rate": 9.010806451612904e-05, + "loss": 0.1664, + "step": 6634 + }, + { + "epoch": 0.10616, + "grad_norm": 0.890625, + "learning_rate": 9.010645161290324e-05, + "loss": 0.2108, + "step": 6635 + }, + { + "epoch": 0.106176, + "grad_norm": 1.1015625, + "learning_rate": 9.010483870967742e-05, + "loss": 0.1611, + "step": 6636 + }, + { + "epoch": 0.106192, + "grad_norm": 1.34375, + "learning_rate": 9.010322580645162e-05, + "loss": 0.1264, + "step": 6637 + }, + { + "epoch": 0.106208, + "grad_norm": 0.7734375, + "learning_rate": 9.010161290322581e-05, + "loss": 0.1584, + "step": 6638 + }, + { + "epoch": 0.106224, + "grad_norm": 0.734375, + "learning_rate": 9.010000000000001e-05, + "loss": 0.1541, + "step": 6639 + }, + { + "epoch": 0.10624, + "grad_norm": 0.55859375, + "learning_rate": 9.00983870967742e-05, + "loss": 0.1278, + "step": 6640 + }, + { + "epoch": 0.106256, + "grad_norm": 0.78125, + "learning_rate": 9.00967741935484e-05, + "loss": 0.1788, + "step": 6641 + }, + { + "epoch": 0.106272, + "grad_norm": 1.5625, + "learning_rate": 9.009516129032258e-05, + "loss": 0.1904, + "step": 6642 + }, + { + "epoch": 0.106288, + "grad_norm": 0.67578125, + "learning_rate": 9.009354838709678e-05, + "loss": 0.1945, + "step": 6643 + }, + { + "epoch": 0.106304, + "grad_norm": 1.046875, + "learning_rate": 9.009193548387097e-05, + "loss": 0.2313, + "step": 6644 + }, + { + "epoch": 0.10632, + "grad_norm": 0.9921875, + "learning_rate": 9.009032258064517e-05, + "loss": 0.2015, + "step": 6645 + }, + { + "epoch": 0.106336, + "grad_norm": 0.7265625, + "learning_rate": 9.008870967741936e-05, + "loss": 0.1967, + "step": 6646 + }, + { + "epoch": 0.106352, + "grad_norm": 0.90625, + "learning_rate": 9.008709677419356e-05, + "loss": 0.2053, + "step": 6647 + }, + { + "epoch": 0.106368, + "grad_norm": 0.71484375, + "learning_rate": 9.008548387096775e-05, + "loss": 0.1879, + "step": 6648 + }, + { + "epoch": 0.106384, + "grad_norm": 0.8984375, + "learning_rate": 9.008387096774194e-05, + "loss": 0.2122, + "step": 6649 + }, + { + "epoch": 0.1064, + "grad_norm": 0.9921875, + "learning_rate": 9.008225806451614e-05, + "loss": 0.1569, + "step": 6650 + }, + { + "epoch": 0.106416, + "grad_norm": 0.83203125, + "learning_rate": 9.008064516129032e-05, + "loss": 0.2115, + "step": 6651 + }, + { + "epoch": 0.106432, + "grad_norm": 0.58984375, + "learning_rate": 9.007903225806452e-05, + "loss": 0.1942, + "step": 6652 + }, + { + "epoch": 0.106448, + "grad_norm": 0.609375, + "learning_rate": 9.007741935483871e-05, + "loss": 0.1669, + "step": 6653 + }, + { + "epoch": 0.106464, + "grad_norm": 0.7265625, + "learning_rate": 9.007580645161291e-05, + "loss": 0.2116, + "step": 6654 + }, + { + "epoch": 0.10648, + "grad_norm": 1.3515625, + "learning_rate": 9.00741935483871e-05, + "loss": 0.1854, + "step": 6655 + }, + { + "epoch": 0.106496, + "grad_norm": 0.765625, + "learning_rate": 9.00725806451613e-05, + "loss": 0.187, + "step": 6656 + }, + { + "epoch": 0.106512, + "grad_norm": 0.9609375, + "learning_rate": 9.007096774193548e-05, + "loss": 0.1499, + "step": 6657 + }, + { + "epoch": 0.106528, + "grad_norm": 0.8125, + "learning_rate": 9.006935483870968e-05, + "loss": 0.1911, + "step": 6658 + }, + { + "epoch": 0.106544, + "grad_norm": 0.65234375, + "learning_rate": 9.006774193548388e-05, + "loss": 0.167, + "step": 6659 + }, + { + "epoch": 0.10656, + "grad_norm": 0.8515625, + "learning_rate": 9.006612903225808e-05, + "loss": 0.2066, + "step": 6660 + }, + { + "epoch": 0.106576, + "grad_norm": 0.765625, + "learning_rate": 9.006451612903226e-05, + "loss": 0.1644, + "step": 6661 + }, + { + "epoch": 0.106592, + "grad_norm": 0.7109375, + "learning_rate": 9.006290322580646e-05, + "loss": 0.1778, + "step": 6662 + }, + { + "epoch": 0.106608, + "grad_norm": 0.5625, + "learning_rate": 9.006129032258065e-05, + "loss": 0.1557, + "step": 6663 + }, + { + "epoch": 0.106624, + "grad_norm": 0.82421875, + "learning_rate": 9.005967741935484e-05, + "loss": 0.1362, + "step": 6664 + }, + { + "epoch": 0.10664, + "grad_norm": 0.734375, + "learning_rate": 9.005806451612904e-05, + "loss": 0.1873, + "step": 6665 + }, + { + "epoch": 0.106656, + "grad_norm": 0.85546875, + "learning_rate": 9.005645161290322e-05, + "loss": 0.1845, + "step": 6666 + }, + { + "epoch": 0.106672, + "grad_norm": 1.7578125, + "learning_rate": 9.005483870967742e-05, + "loss": 0.2364, + "step": 6667 + }, + { + "epoch": 0.106688, + "grad_norm": 1.1875, + "learning_rate": 9.005322580645161e-05, + "loss": 0.1747, + "step": 6668 + }, + { + "epoch": 0.106704, + "grad_norm": 0.88671875, + "learning_rate": 9.005161290322581e-05, + "loss": 0.1825, + "step": 6669 + }, + { + "epoch": 0.10672, + "grad_norm": 0.68359375, + "learning_rate": 9.005000000000001e-05, + "loss": 0.2082, + "step": 6670 + }, + { + "epoch": 0.106736, + "grad_norm": 0.87109375, + "learning_rate": 9.00483870967742e-05, + "loss": 0.1786, + "step": 6671 + }, + { + "epoch": 0.106752, + "grad_norm": 0.98828125, + "learning_rate": 9.004677419354839e-05, + "loss": 0.206, + "step": 6672 + }, + { + "epoch": 0.106768, + "grad_norm": 0.9921875, + "learning_rate": 9.004516129032259e-05, + "loss": 0.1718, + "step": 6673 + }, + { + "epoch": 0.106784, + "grad_norm": 0.93359375, + "learning_rate": 9.004354838709678e-05, + "loss": 0.147, + "step": 6674 + }, + { + "epoch": 0.1068, + "grad_norm": 1.09375, + "learning_rate": 9.004193548387098e-05, + "loss": 0.1555, + "step": 6675 + }, + { + "epoch": 0.106816, + "grad_norm": 0.87890625, + "learning_rate": 9.004032258064516e-05, + "loss": 0.1827, + "step": 6676 + }, + { + "epoch": 0.106832, + "grad_norm": 0.64453125, + "learning_rate": 9.003870967741936e-05, + "loss": 0.1941, + "step": 6677 + }, + { + "epoch": 0.106848, + "grad_norm": 0.79296875, + "learning_rate": 9.003709677419355e-05, + "loss": 0.2028, + "step": 6678 + }, + { + "epoch": 0.106864, + "grad_norm": 0.76171875, + "learning_rate": 9.003548387096775e-05, + "loss": 0.149, + "step": 6679 + }, + { + "epoch": 0.10688, + "grad_norm": 0.78125, + "learning_rate": 9.003387096774194e-05, + "loss": 0.1914, + "step": 6680 + }, + { + "epoch": 0.106896, + "grad_norm": 1.0, + "learning_rate": 9.003225806451614e-05, + "loss": 0.2235, + "step": 6681 + }, + { + "epoch": 0.106912, + "grad_norm": 0.74609375, + "learning_rate": 9.003064516129032e-05, + "loss": 0.1661, + "step": 6682 + }, + { + "epoch": 0.106928, + "grad_norm": 1.0234375, + "learning_rate": 9.002903225806452e-05, + "loss": 0.1905, + "step": 6683 + }, + { + "epoch": 0.106944, + "grad_norm": 0.57421875, + "learning_rate": 9.002741935483872e-05, + "loss": 0.146, + "step": 6684 + }, + { + "epoch": 0.10696, + "grad_norm": 0.5625, + "learning_rate": 9.00258064516129e-05, + "loss": 0.1834, + "step": 6685 + }, + { + "epoch": 0.106976, + "grad_norm": 1.3671875, + "learning_rate": 9.00241935483871e-05, + "loss": 0.1948, + "step": 6686 + }, + { + "epoch": 0.106992, + "grad_norm": 0.85546875, + "learning_rate": 9.002258064516129e-05, + "loss": 0.1848, + "step": 6687 + }, + { + "epoch": 0.107008, + "grad_norm": 0.84765625, + "learning_rate": 9.002096774193549e-05, + "loss": 0.1612, + "step": 6688 + }, + { + "epoch": 0.107024, + "grad_norm": 0.80859375, + "learning_rate": 9.001935483870968e-05, + "loss": 0.1942, + "step": 6689 + }, + { + "epoch": 0.10704, + "grad_norm": 1.0703125, + "learning_rate": 9.001774193548388e-05, + "loss": 0.1825, + "step": 6690 + }, + { + "epoch": 0.107056, + "grad_norm": 0.62109375, + "learning_rate": 9.001612903225806e-05, + "loss": 0.157, + "step": 6691 + }, + { + "epoch": 0.107072, + "grad_norm": 0.9296875, + "learning_rate": 9.001451612903226e-05, + "loss": 0.1969, + "step": 6692 + }, + { + "epoch": 0.107088, + "grad_norm": 0.9375, + "learning_rate": 9.001290322580645e-05, + "loss": 0.2219, + "step": 6693 + }, + { + "epoch": 0.107104, + "grad_norm": 1.0546875, + "learning_rate": 9.001129032258065e-05, + "loss": 0.1861, + "step": 6694 + }, + { + "epoch": 0.10712, + "grad_norm": 1.0859375, + "learning_rate": 9.000967741935485e-05, + "loss": 0.2049, + "step": 6695 + }, + { + "epoch": 0.107136, + "grad_norm": 0.60546875, + "learning_rate": 9.000806451612903e-05, + "loss": 0.1661, + "step": 6696 + }, + { + "epoch": 0.107152, + "grad_norm": 0.73828125, + "learning_rate": 9.000645161290323e-05, + "loss": 0.1404, + "step": 6697 + }, + { + "epoch": 0.107168, + "grad_norm": 1.0546875, + "learning_rate": 9.000483870967742e-05, + "loss": 0.2318, + "step": 6698 + }, + { + "epoch": 0.107184, + "grad_norm": 0.84765625, + "learning_rate": 9.000322580645162e-05, + "loss": 0.1627, + "step": 6699 + }, + { + "epoch": 0.1072, + "grad_norm": 0.86328125, + "learning_rate": 9.00016129032258e-05, + "loss": 0.1831, + "step": 6700 + }, + { + "epoch": 0.107216, + "grad_norm": 0.70703125, + "learning_rate": 9e-05, + "loss": 0.1608, + "step": 6701 + }, + { + "epoch": 0.107232, + "grad_norm": 0.59375, + "learning_rate": 8.999838709677419e-05, + "loss": 0.1693, + "step": 6702 + }, + { + "epoch": 0.107248, + "grad_norm": 0.7734375, + "learning_rate": 8.999677419354839e-05, + "loss": 0.1635, + "step": 6703 + }, + { + "epoch": 0.107264, + "grad_norm": 0.64453125, + "learning_rate": 8.999516129032258e-05, + "loss": 0.1949, + "step": 6704 + }, + { + "epoch": 0.10728, + "grad_norm": 0.76953125, + "learning_rate": 8.999354838709678e-05, + "loss": 0.2034, + "step": 6705 + }, + { + "epoch": 0.107296, + "grad_norm": 0.77734375, + "learning_rate": 8.999193548387098e-05, + "loss": 0.1579, + "step": 6706 + }, + { + "epoch": 0.107312, + "grad_norm": 1.015625, + "learning_rate": 8.999032258064518e-05, + "loss": 0.1988, + "step": 6707 + }, + { + "epoch": 0.107328, + "grad_norm": 0.8046875, + "learning_rate": 8.998870967741936e-05, + "loss": 0.1553, + "step": 6708 + }, + { + "epoch": 0.107344, + "grad_norm": 0.8125, + "learning_rate": 8.998709677419356e-05, + "loss": 0.1324, + "step": 6709 + }, + { + "epoch": 0.10736, + "grad_norm": 0.8125, + "learning_rate": 8.998548387096775e-05, + "loss": 0.1536, + "step": 6710 + }, + { + "epoch": 0.107376, + "grad_norm": 0.828125, + "learning_rate": 8.998387096774193e-05, + "loss": 0.2035, + "step": 6711 + }, + { + "epoch": 0.107392, + "grad_norm": 0.66796875, + "learning_rate": 8.998225806451613e-05, + "loss": 0.1622, + "step": 6712 + }, + { + "epoch": 0.107408, + "grad_norm": 0.76953125, + "learning_rate": 8.998064516129032e-05, + "loss": 0.2198, + "step": 6713 + }, + { + "epoch": 0.107424, + "grad_norm": 0.70703125, + "learning_rate": 8.997903225806452e-05, + "loss": 0.1722, + "step": 6714 + }, + { + "epoch": 0.10744, + "grad_norm": 0.75, + "learning_rate": 8.99774193548387e-05, + "loss": 0.1469, + "step": 6715 + }, + { + "epoch": 0.107456, + "grad_norm": 0.92578125, + "learning_rate": 8.99758064516129e-05, + "loss": 0.1873, + "step": 6716 + }, + { + "epoch": 0.107472, + "grad_norm": 0.85546875, + "learning_rate": 8.997419354838709e-05, + "loss": 0.1703, + "step": 6717 + }, + { + "epoch": 0.107488, + "grad_norm": 0.77734375, + "learning_rate": 8.997258064516129e-05, + "loss": 0.199, + "step": 6718 + }, + { + "epoch": 0.107504, + "grad_norm": 0.734375, + "learning_rate": 8.997096774193549e-05, + "loss": 0.154, + "step": 6719 + }, + { + "epoch": 0.10752, + "grad_norm": 0.90625, + "learning_rate": 8.996935483870969e-05, + "loss": 0.1516, + "step": 6720 + }, + { + "epoch": 0.107536, + "grad_norm": 0.76171875, + "learning_rate": 8.996774193548388e-05, + "loss": 0.119, + "step": 6721 + }, + { + "epoch": 0.107552, + "grad_norm": 0.80078125, + "learning_rate": 8.996612903225808e-05, + "loss": 0.1488, + "step": 6722 + }, + { + "epoch": 0.107568, + "grad_norm": 0.61328125, + "learning_rate": 8.996451612903226e-05, + "loss": 0.1971, + "step": 6723 + }, + { + "epoch": 0.107584, + "grad_norm": 1.0546875, + "learning_rate": 8.996290322580646e-05, + "loss": 0.2122, + "step": 6724 + }, + { + "epoch": 0.1076, + "grad_norm": 0.76953125, + "learning_rate": 8.996129032258065e-05, + "loss": 0.1845, + "step": 6725 + }, + { + "epoch": 0.107616, + "grad_norm": 0.6171875, + "learning_rate": 8.995967741935483e-05, + "loss": 0.1589, + "step": 6726 + }, + { + "epoch": 0.107632, + "grad_norm": 0.83203125, + "learning_rate": 8.995806451612903e-05, + "loss": 0.1536, + "step": 6727 + }, + { + "epoch": 0.107648, + "grad_norm": 0.921875, + "learning_rate": 8.995645161290322e-05, + "loss": 0.2136, + "step": 6728 + }, + { + "epoch": 0.107664, + "grad_norm": 0.76171875, + "learning_rate": 8.995483870967742e-05, + "loss": 0.1749, + "step": 6729 + }, + { + "epoch": 0.10768, + "grad_norm": 0.9921875, + "learning_rate": 8.995322580645162e-05, + "loss": 0.1982, + "step": 6730 + }, + { + "epoch": 0.107696, + "grad_norm": 0.7109375, + "learning_rate": 8.995161290322582e-05, + "loss": 0.1901, + "step": 6731 + }, + { + "epoch": 0.107712, + "grad_norm": 0.703125, + "learning_rate": 8.995e-05, + "loss": 0.1754, + "step": 6732 + }, + { + "epoch": 0.107728, + "grad_norm": 0.79296875, + "learning_rate": 8.99483870967742e-05, + "loss": 0.2366, + "step": 6733 + }, + { + "epoch": 0.107744, + "grad_norm": 0.69140625, + "learning_rate": 8.994677419354839e-05, + "loss": 0.1352, + "step": 6734 + }, + { + "epoch": 0.10776, + "grad_norm": 0.921875, + "learning_rate": 8.994516129032259e-05, + "loss": 0.2044, + "step": 6735 + }, + { + "epoch": 0.107776, + "grad_norm": 1.2734375, + "learning_rate": 8.994354838709678e-05, + "loss": 0.1554, + "step": 6736 + }, + { + "epoch": 0.107792, + "grad_norm": 1.2890625, + "learning_rate": 8.994193548387098e-05, + "loss": 0.1946, + "step": 6737 + }, + { + "epoch": 0.107808, + "grad_norm": 0.81640625, + "learning_rate": 8.994032258064516e-05, + "loss": 0.168, + "step": 6738 + }, + { + "epoch": 0.107824, + "grad_norm": 1.140625, + "learning_rate": 8.993870967741936e-05, + "loss": 0.1986, + "step": 6739 + }, + { + "epoch": 0.10784, + "grad_norm": 0.90625, + "learning_rate": 8.993709677419355e-05, + "loss": 0.174, + "step": 6740 + }, + { + "epoch": 0.107856, + "grad_norm": 0.828125, + "learning_rate": 8.993548387096775e-05, + "loss": 0.1635, + "step": 6741 + }, + { + "epoch": 0.107872, + "grad_norm": 0.67578125, + "learning_rate": 8.993387096774195e-05, + "loss": 0.1826, + "step": 6742 + }, + { + "epoch": 0.107888, + "grad_norm": 0.81640625, + "learning_rate": 8.993225806451613e-05, + "loss": 0.1591, + "step": 6743 + }, + { + "epoch": 0.107904, + "grad_norm": 1.3828125, + "learning_rate": 8.993064516129033e-05, + "loss": 0.1588, + "step": 6744 + }, + { + "epoch": 0.10792, + "grad_norm": 0.6640625, + "learning_rate": 8.992903225806452e-05, + "loss": 0.1717, + "step": 6745 + }, + { + "epoch": 0.107936, + "grad_norm": 0.76953125, + "learning_rate": 8.992741935483872e-05, + "loss": 0.1867, + "step": 6746 + }, + { + "epoch": 0.107952, + "grad_norm": 0.796875, + "learning_rate": 8.99258064516129e-05, + "loss": 0.1954, + "step": 6747 + }, + { + "epoch": 0.107968, + "grad_norm": 1.25, + "learning_rate": 8.99241935483871e-05, + "loss": 0.1852, + "step": 6748 + }, + { + "epoch": 0.107984, + "grad_norm": 0.7578125, + "learning_rate": 8.992258064516129e-05, + "loss": 0.1613, + "step": 6749 + }, + { + "epoch": 0.108, + "grad_norm": 0.80078125, + "learning_rate": 8.992096774193549e-05, + "loss": 0.2051, + "step": 6750 + }, + { + "epoch": 0.108016, + "grad_norm": 0.74609375, + "learning_rate": 8.991935483870968e-05, + "loss": 0.1816, + "step": 6751 + }, + { + "epoch": 0.108032, + "grad_norm": 0.64453125, + "learning_rate": 8.991774193548388e-05, + "loss": 0.161, + "step": 6752 + }, + { + "epoch": 0.108048, + "grad_norm": 0.8359375, + "learning_rate": 8.991612903225806e-05, + "loss": 0.1748, + "step": 6753 + }, + { + "epoch": 0.108064, + "grad_norm": 1.0859375, + "learning_rate": 8.991451612903226e-05, + "loss": 0.1588, + "step": 6754 + }, + { + "epoch": 0.10808, + "grad_norm": 0.94921875, + "learning_rate": 8.991290322580646e-05, + "loss": 0.1852, + "step": 6755 + }, + { + "epoch": 0.108096, + "grad_norm": 0.65625, + "learning_rate": 8.991129032258066e-05, + "loss": 0.1249, + "step": 6756 + }, + { + "epoch": 0.108112, + "grad_norm": 0.875, + "learning_rate": 8.990967741935485e-05, + "loss": 0.1631, + "step": 6757 + }, + { + "epoch": 0.108128, + "grad_norm": 0.71484375, + "learning_rate": 8.990806451612903e-05, + "loss": 0.1748, + "step": 6758 + }, + { + "epoch": 0.108144, + "grad_norm": 1.5, + "learning_rate": 8.990645161290323e-05, + "loss": 0.202, + "step": 6759 + }, + { + "epoch": 0.10816, + "grad_norm": 0.57421875, + "learning_rate": 8.990483870967742e-05, + "loss": 0.2047, + "step": 6760 + }, + { + "epoch": 0.108176, + "grad_norm": 0.734375, + "learning_rate": 8.990322580645162e-05, + "loss": 0.1561, + "step": 6761 + }, + { + "epoch": 0.108192, + "grad_norm": 0.74609375, + "learning_rate": 8.99016129032258e-05, + "loss": 0.1831, + "step": 6762 + }, + { + "epoch": 0.108208, + "grad_norm": 0.58203125, + "learning_rate": 8.99e-05, + "loss": 0.1661, + "step": 6763 + }, + { + "epoch": 0.108224, + "grad_norm": 0.66015625, + "learning_rate": 8.989838709677419e-05, + "loss": 0.1793, + "step": 6764 + }, + { + "epoch": 0.10824, + "grad_norm": 0.93359375, + "learning_rate": 8.989677419354839e-05, + "loss": 0.1825, + "step": 6765 + }, + { + "epoch": 0.108256, + "grad_norm": 0.8359375, + "learning_rate": 8.989516129032259e-05, + "loss": 0.1632, + "step": 6766 + }, + { + "epoch": 0.108272, + "grad_norm": 0.984375, + "learning_rate": 8.989354838709679e-05, + "loss": 0.1971, + "step": 6767 + }, + { + "epoch": 0.108288, + "grad_norm": 1.0, + "learning_rate": 8.989193548387097e-05, + "loss": 0.2028, + "step": 6768 + }, + { + "epoch": 0.108304, + "grad_norm": 0.59375, + "learning_rate": 8.989032258064517e-05, + "loss": 0.1679, + "step": 6769 + }, + { + "epoch": 0.10832, + "grad_norm": 0.875, + "learning_rate": 8.988870967741936e-05, + "loss": 0.1628, + "step": 6770 + }, + { + "epoch": 0.108336, + "grad_norm": 0.66015625, + "learning_rate": 8.988709677419356e-05, + "loss": 0.153, + "step": 6771 + }, + { + "epoch": 0.108352, + "grad_norm": 0.93359375, + "learning_rate": 8.988548387096775e-05, + "loss": 0.2071, + "step": 6772 + }, + { + "epoch": 0.108368, + "grad_norm": 0.68359375, + "learning_rate": 8.988387096774193e-05, + "loss": 0.1978, + "step": 6773 + }, + { + "epoch": 0.108384, + "grad_norm": 0.8671875, + "learning_rate": 8.988225806451613e-05, + "loss": 0.1857, + "step": 6774 + }, + { + "epoch": 0.1084, + "grad_norm": 0.875, + "learning_rate": 8.988064516129032e-05, + "loss": 0.1819, + "step": 6775 + }, + { + "epoch": 0.108416, + "grad_norm": 0.92578125, + "learning_rate": 8.987903225806452e-05, + "loss": 0.1563, + "step": 6776 + }, + { + "epoch": 0.108432, + "grad_norm": 0.734375, + "learning_rate": 8.987741935483872e-05, + "loss": 0.1825, + "step": 6777 + }, + { + "epoch": 0.108448, + "grad_norm": 0.7890625, + "learning_rate": 8.98758064516129e-05, + "loss": 0.151, + "step": 6778 + }, + { + "epoch": 0.108464, + "grad_norm": 1.0859375, + "learning_rate": 8.98741935483871e-05, + "loss": 0.1682, + "step": 6779 + }, + { + "epoch": 0.10848, + "grad_norm": 0.67578125, + "learning_rate": 8.98725806451613e-05, + "loss": 0.1785, + "step": 6780 + }, + { + "epoch": 0.108496, + "grad_norm": 0.91015625, + "learning_rate": 8.987096774193549e-05, + "loss": 0.222, + "step": 6781 + }, + { + "epoch": 0.108512, + "grad_norm": 0.75390625, + "learning_rate": 8.986935483870969e-05, + "loss": 0.2143, + "step": 6782 + }, + { + "epoch": 0.108528, + "grad_norm": 0.73046875, + "learning_rate": 8.986774193548387e-05, + "loss": 0.1739, + "step": 6783 + }, + { + "epoch": 0.108544, + "grad_norm": 0.65625, + "learning_rate": 8.986612903225807e-05, + "loss": 0.1682, + "step": 6784 + }, + { + "epoch": 0.10856, + "grad_norm": 0.84375, + "learning_rate": 8.986451612903226e-05, + "loss": 0.1604, + "step": 6785 + }, + { + "epoch": 0.108576, + "grad_norm": 1.0625, + "learning_rate": 8.986290322580646e-05, + "loss": 0.1288, + "step": 6786 + }, + { + "epoch": 0.108592, + "grad_norm": 0.8359375, + "learning_rate": 8.986129032258065e-05, + "loss": 0.1891, + "step": 6787 + }, + { + "epoch": 0.108608, + "grad_norm": 0.71484375, + "learning_rate": 8.985967741935484e-05, + "loss": 0.2031, + "step": 6788 + }, + { + "epoch": 0.108624, + "grad_norm": 0.82421875, + "learning_rate": 8.985806451612903e-05, + "loss": 0.1586, + "step": 6789 + }, + { + "epoch": 0.10864, + "grad_norm": 1.7734375, + "learning_rate": 8.985645161290323e-05, + "loss": 0.2179, + "step": 6790 + }, + { + "epoch": 0.108656, + "grad_norm": 1.78125, + "learning_rate": 8.985483870967743e-05, + "loss": 0.2045, + "step": 6791 + }, + { + "epoch": 0.108672, + "grad_norm": 1.03125, + "learning_rate": 8.985322580645162e-05, + "loss": 0.1595, + "step": 6792 + }, + { + "epoch": 0.108688, + "grad_norm": 0.796875, + "learning_rate": 8.985161290322582e-05, + "loss": 0.1584, + "step": 6793 + }, + { + "epoch": 0.108704, + "grad_norm": 0.52734375, + "learning_rate": 8.985e-05, + "loss": 0.1612, + "step": 6794 + }, + { + "epoch": 0.10872, + "grad_norm": 0.8125, + "learning_rate": 8.98483870967742e-05, + "loss": 0.1968, + "step": 6795 + }, + { + "epoch": 0.108736, + "grad_norm": 0.70703125, + "learning_rate": 8.984677419354839e-05, + "loss": 0.1521, + "step": 6796 + }, + { + "epoch": 0.108752, + "grad_norm": 0.5703125, + "learning_rate": 8.984516129032259e-05, + "loss": 0.1434, + "step": 6797 + }, + { + "epoch": 0.108768, + "grad_norm": 1.390625, + "learning_rate": 8.984354838709677e-05, + "loss": 0.1823, + "step": 6798 + }, + { + "epoch": 0.108784, + "grad_norm": 1.1640625, + "learning_rate": 8.984193548387097e-05, + "loss": 0.1989, + "step": 6799 + }, + { + "epoch": 0.1088, + "grad_norm": 1.328125, + "learning_rate": 8.984032258064516e-05, + "loss": 0.2484, + "step": 6800 + }, + { + "epoch": 0.108816, + "grad_norm": 1.2890625, + "learning_rate": 8.983870967741936e-05, + "loss": 0.1844, + "step": 6801 + }, + { + "epoch": 0.108832, + "grad_norm": 0.546875, + "learning_rate": 8.983709677419356e-05, + "loss": 0.1452, + "step": 6802 + }, + { + "epoch": 0.108848, + "grad_norm": 0.91796875, + "learning_rate": 8.983548387096776e-05, + "loss": 0.1398, + "step": 6803 + }, + { + "epoch": 0.108864, + "grad_norm": 1.15625, + "learning_rate": 8.983387096774194e-05, + "loss": 0.1459, + "step": 6804 + }, + { + "epoch": 0.10888, + "grad_norm": 1.109375, + "learning_rate": 8.983225806451613e-05, + "loss": 0.1949, + "step": 6805 + }, + { + "epoch": 0.108896, + "grad_norm": 0.5859375, + "learning_rate": 8.983064516129033e-05, + "loss": 0.174, + "step": 6806 + }, + { + "epoch": 0.108912, + "grad_norm": 0.77734375, + "learning_rate": 8.982903225806452e-05, + "loss": 0.1578, + "step": 6807 + }, + { + "epoch": 0.108928, + "grad_norm": 1.0859375, + "learning_rate": 8.982741935483872e-05, + "loss": 0.1716, + "step": 6808 + }, + { + "epoch": 0.108944, + "grad_norm": 0.74609375, + "learning_rate": 8.98258064516129e-05, + "loss": 0.185, + "step": 6809 + }, + { + "epoch": 0.10896, + "grad_norm": 0.9140625, + "learning_rate": 8.98241935483871e-05, + "loss": 0.1935, + "step": 6810 + }, + { + "epoch": 0.108976, + "grad_norm": 1.0625, + "learning_rate": 8.982258064516129e-05, + "loss": 0.2096, + "step": 6811 + }, + { + "epoch": 0.108992, + "grad_norm": 0.69921875, + "learning_rate": 8.982096774193549e-05, + "loss": 0.1773, + "step": 6812 + }, + { + "epoch": 0.109008, + "grad_norm": 0.890625, + "learning_rate": 8.981935483870967e-05, + "loss": 0.1907, + "step": 6813 + }, + { + "epoch": 0.109024, + "grad_norm": 0.97265625, + "learning_rate": 8.981774193548387e-05, + "loss": 0.1907, + "step": 6814 + }, + { + "epoch": 0.10904, + "grad_norm": 1.1796875, + "learning_rate": 8.981612903225807e-05, + "loss": 0.2201, + "step": 6815 + }, + { + "epoch": 0.109056, + "grad_norm": 0.7421875, + "learning_rate": 8.981451612903227e-05, + "loss": 0.1786, + "step": 6816 + }, + { + "epoch": 0.109072, + "grad_norm": 0.6796875, + "learning_rate": 8.981290322580646e-05, + "loss": 0.1611, + "step": 6817 + }, + { + "epoch": 0.109088, + "grad_norm": 0.5703125, + "learning_rate": 8.981129032258066e-05, + "loss": 0.1538, + "step": 6818 + }, + { + "epoch": 0.109104, + "grad_norm": 0.78515625, + "learning_rate": 8.980967741935484e-05, + "loss": 0.1463, + "step": 6819 + }, + { + "epoch": 0.10912, + "grad_norm": 0.6796875, + "learning_rate": 8.980806451612903e-05, + "loss": 0.1994, + "step": 6820 + }, + { + "epoch": 0.109136, + "grad_norm": 0.80078125, + "learning_rate": 8.980645161290323e-05, + "loss": 0.1662, + "step": 6821 + }, + { + "epoch": 0.109152, + "grad_norm": 0.65234375, + "learning_rate": 8.980483870967742e-05, + "loss": 0.2195, + "step": 6822 + }, + { + "epoch": 0.109168, + "grad_norm": 0.671875, + "learning_rate": 8.980322580645162e-05, + "loss": 0.1473, + "step": 6823 + }, + { + "epoch": 0.109184, + "grad_norm": 1.0078125, + "learning_rate": 8.98016129032258e-05, + "loss": 0.1741, + "step": 6824 + }, + { + "epoch": 0.1092, + "grad_norm": 1.3984375, + "learning_rate": 8.98e-05, + "loss": 0.1692, + "step": 6825 + }, + { + "epoch": 0.109216, + "grad_norm": 0.7890625, + "learning_rate": 8.97983870967742e-05, + "loss": 0.1871, + "step": 6826 + }, + { + "epoch": 0.109232, + "grad_norm": 1.0703125, + "learning_rate": 8.97967741935484e-05, + "loss": 0.2088, + "step": 6827 + }, + { + "epoch": 0.109248, + "grad_norm": 0.6953125, + "learning_rate": 8.979516129032259e-05, + "loss": 0.1753, + "step": 6828 + }, + { + "epoch": 0.109264, + "grad_norm": 0.7578125, + "learning_rate": 8.979354838709679e-05, + "loss": 0.2345, + "step": 6829 + }, + { + "epoch": 0.10928, + "grad_norm": 0.77734375, + "learning_rate": 8.979193548387097e-05, + "loss": 0.181, + "step": 6830 + }, + { + "epoch": 0.109296, + "grad_norm": 1.1640625, + "learning_rate": 8.979032258064517e-05, + "loss": 0.1892, + "step": 6831 + }, + { + "epoch": 0.109312, + "grad_norm": 0.9453125, + "learning_rate": 8.978870967741936e-05, + "loss": 0.1748, + "step": 6832 + }, + { + "epoch": 0.109328, + "grad_norm": 0.7578125, + "learning_rate": 8.978709677419356e-05, + "loss": 0.1691, + "step": 6833 + }, + { + "epoch": 0.109344, + "grad_norm": 1.453125, + "learning_rate": 8.978548387096774e-05, + "loss": 0.2418, + "step": 6834 + }, + { + "epoch": 0.10936, + "grad_norm": 1.1875, + "learning_rate": 8.978387096774193e-05, + "loss": 0.204, + "step": 6835 + }, + { + "epoch": 0.109376, + "grad_norm": 0.953125, + "learning_rate": 8.978225806451613e-05, + "loss": 0.1721, + "step": 6836 + }, + { + "epoch": 0.109392, + "grad_norm": 0.73046875, + "learning_rate": 8.978064516129033e-05, + "loss": 0.1778, + "step": 6837 + }, + { + "epoch": 0.109408, + "grad_norm": 0.8203125, + "learning_rate": 8.977903225806453e-05, + "loss": 0.199, + "step": 6838 + }, + { + "epoch": 0.109424, + "grad_norm": 1.0625, + "learning_rate": 8.977741935483871e-05, + "loss": 0.1766, + "step": 6839 + }, + { + "epoch": 0.10944, + "grad_norm": 0.78515625, + "learning_rate": 8.977580645161291e-05, + "loss": 0.2033, + "step": 6840 + }, + { + "epoch": 0.109456, + "grad_norm": 0.640625, + "learning_rate": 8.97741935483871e-05, + "loss": 0.1751, + "step": 6841 + }, + { + "epoch": 0.109472, + "grad_norm": 1.265625, + "learning_rate": 8.97725806451613e-05, + "loss": 0.1663, + "step": 6842 + }, + { + "epoch": 0.109488, + "grad_norm": 1.15625, + "learning_rate": 8.977096774193549e-05, + "loss": 0.1936, + "step": 6843 + }, + { + "epoch": 0.109504, + "grad_norm": 0.60546875, + "learning_rate": 8.976935483870969e-05, + "loss": 0.1812, + "step": 6844 + }, + { + "epoch": 0.10952, + "grad_norm": 0.8203125, + "learning_rate": 8.976774193548387e-05, + "loss": 0.2296, + "step": 6845 + }, + { + "epoch": 0.109536, + "grad_norm": 0.73046875, + "learning_rate": 8.976612903225807e-05, + "loss": 0.1786, + "step": 6846 + }, + { + "epoch": 0.109552, + "grad_norm": 0.5078125, + "learning_rate": 8.976451612903226e-05, + "loss": 0.1644, + "step": 6847 + }, + { + "epoch": 0.109568, + "grad_norm": 0.81640625, + "learning_rate": 8.976290322580646e-05, + "loss": 0.1904, + "step": 6848 + }, + { + "epoch": 0.109584, + "grad_norm": 0.8125, + "learning_rate": 8.976129032258064e-05, + "loss": 0.171, + "step": 6849 + }, + { + "epoch": 0.1096, + "grad_norm": 1.078125, + "learning_rate": 8.975967741935484e-05, + "loss": 0.1897, + "step": 6850 + }, + { + "epoch": 0.109616, + "grad_norm": 0.59765625, + "learning_rate": 8.975806451612904e-05, + "loss": 0.1563, + "step": 6851 + }, + { + "epoch": 0.109632, + "grad_norm": 0.84765625, + "learning_rate": 8.975645161290323e-05, + "loss": 0.1713, + "step": 6852 + }, + { + "epoch": 0.109648, + "grad_norm": 1.046875, + "learning_rate": 8.975483870967743e-05, + "loss": 0.176, + "step": 6853 + }, + { + "epoch": 0.109664, + "grad_norm": 1.140625, + "learning_rate": 8.975322580645161e-05, + "loss": 0.225, + "step": 6854 + }, + { + "epoch": 0.10968, + "grad_norm": 0.50390625, + "learning_rate": 8.975161290322581e-05, + "loss": 0.1397, + "step": 6855 + }, + { + "epoch": 0.109696, + "grad_norm": 1.15625, + "learning_rate": 8.975e-05, + "loss": 0.1628, + "step": 6856 + }, + { + "epoch": 0.109712, + "grad_norm": 0.65625, + "learning_rate": 8.97483870967742e-05, + "loss": 0.1909, + "step": 6857 + }, + { + "epoch": 0.109728, + "grad_norm": 1.4296875, + "learning_rate": 8.974677419354839e-05, + "loss": 0.1845, + "step": 6858 + }, + { + "epoch": 0.109744, + "grad_norm": 0.80859375, + "learning_rate": 8.974516129032258e-05, + "loss": 0.165, + "step": 6859 + }, + { + "epoch": 0.10976, + "grad_norm": 1.0546875, + "learning_rate": 8.974354838709677e-05, + "loss": 0.1975, + "step": 6860 + }, + { + "epoch": 0.109776, + "grad_norm": 0.55859375, + "learning_rate": 8.974193548387097e-05, + "loss": 0.1633, + "step": 6861 + }, + { + "epoch": 0.109792, + "grad_norm": 0.61328125, + "learning_rate": 8.974032258064517e-05, + "loss": 0.1473, + "step": 6862 + }, + { + "epoch": 0.109808, + "grad_norm": 0.75390625, + "learning_rate": 8.973870967741937e-05, + "loss": 0.2226, + "step": 6863 + }, + { + "epoch": 0.109824, + "grad_norm": 0.75, + "learning_rate": 8.973709677419356e-05, + "loss": 0.1853, + "step": 6864 + }, + { + "epoch": 0.10984, + "grad_norm": 0.83984375, + "learning_rate": 8.973548387096776e-05, + "loss": 0.1655, + "step": 6865 + }, + { + "epoch": 0.109856, + "grad_norm": 0.6015625, + "learning_rate": 8.973387096774194e-05, + "loss": 0.2008, + "step": 6866 + }, + { + "epoch": 0.109872, + "grad_norm": 0.53515625, + "learning_rate": 8.973225806451613e-05, + "loss": 0.1339, + "step": 6867 + }, + { + "epoch": 0.109888, + "grad_norm": 0.72265625, + "learning_rate": 8.973064516129033e-05, + "loss": 0.1922, + "step": 6868 + }, + { + "epoch": 0.109904, + "grad_norm": 1.09375, + "learning_rate": 8.972903225806451e-05, + "loss": 0.1933, + "step": 6869 + }, + { + "epoch": 0.10992, + "grad_norm": 0.72265625, + "learning_rate": 8.972741935483871e-05, + "loss": 0.1709, + "step": 6870 + }, + { + "epoch": 0.109936, + "grad_norm": 0.7109375, + "learning_rate": 8.97258064516129e-05, + "loss": 0.1955, + "step": 6871 + }, + { + "epoch": 0.109952, + "grad_norm": 0.68359375, + "learning_rate": 8.97241935483871e-05, + "loss": 0.1647, + "step": 6872 + }, + { + "epoch": 0.109968, + "grad_norm": 0.921875, + "learning_rate": 8.972258064516128e-05, + "loss": 0.1613, + "step": 6873 + }, + { + "epoch": 0.109984, + "grad_norm": 0.7578125, + "learning_rate": 8.972096774193548e-05, + "loss": 0.1897, + "step": 6874 + }, + { + "epoch": 0.11, + "grad_norm": 0.8203125, + "learning_rate": 8.971935483870968e-05, + "loss": 0.2076, + "step": 6875 + }, + { + "epoch": 0.110016, + "grad_norm": 0.74609375, + "learning_rate": 8.971774193548388e-05, + "loss": 0.1945, + "step": 6876 + }, + { + "epoch": 0.110032, + "grad_norm": 0.66796875, + "learning_rate": 8.971612903225807e-05, + "loss": 0.193, + "step": 6877 + }, + { + "epoch": 0.110048, + "grad_norm": 0.79296875, + "learning_rate": 8.971451612903227e-05, + "loss": 0.1806, + "step": 6878 + }, + { + "epoch": 0.110064, + "grad_norm": 0.5078125, + "learning_rate": 8.971290322580646e-05, + "loss": 0.1135, + "step": 6879 + }, + { + "epoch": 0.11008, + "grad_norm": 1.046875, + "learning_rate": 8.971129032258066e-05, + "loss": 0.2066, + "step": 6880 + }, + { + "epoch": 0.110096, + "grad_norm": 0.765625, + "learning_rate": 8.970967741935484e-05, + "loss": 0.1355, + "step": 6881 + }, + { + "epoch": 0.110112, + "grad_norm": 0.7421875, + "learning_rate": 8.970806451612903e-05, + "loss": 0.203, + "step": 6882 + }, + { + "epoch": 0.110128, + "grad_norm": 0.94921875, + "learning_rate": 8.970645161290323e-05, + "loss": 0.2134, + "step": 6883 + }, + { + "epoch": 0.110144, + "grad_norm": 1.1328125, + "learning_rate": 8.970483870967741e-05, + "loss": 0.2113, + "step": 6884 + }, + { + "epoch": 0.11016, + "grad_norm": 0.73046875, + "learning_rate": 8.970322580645161e-05, + "loss": 0.1654, + "step": 6885 + }, + { + "epoch": 0.110176, + "grad_norm": 0.6484375, + "learning_rate": 8.970161290322581e-05, + "loss": 0.1708, + "step": 6886 + }, + { + "epoch": 0.110192, + "grad_norm": 0.6328125, + "learning_rate": 8.970000000000001e-05, + "loss": 0.1701, + "step": 6887 + }, + { + "epoch": 0.110208, + "grad_norm": 0.91796875, + "learning_rate": 8.96983870967742e-05, + "loss": 0.1975, + "step": 6888 + }, + { + "epoch": 0.110224, + "grad_norm": 1.015625, + "learning_rate": 8.96967741935484e-05, + "loss": 0.2004, + "step": 6889 + }, + { + "epoch": 0.11024, + "grad_norm": 0.75390625, + "learning_rate": 8.969516129032258e-05, + "loss": 0.1573, + "step": 6890 + }, + { + "epoch": 0.110256, + "grad_norm": 1.1015625, + "learning_rate": 8.969354838709678e-05, + "loss": 0.1914, + "step": 6891 + }, + { + "epoch": 0.110272, + "grad_norm": 1.0, + "learning_rate": 8.969193548387097e-05, + "loss": 0.1934, + "step": 6892 + }, + { + "epoch": 0.110288, + "grad_norm": 0.72265625, + "learning_rate": 8.969032258064517e-05, + "loss": 0.1497, + "step": 6893 + }, + { + "epoch": 0.110304, + "grad_norm": 0.8203125, + "learning_rate": 8.968870967741936e-05, + "loss": 0.1849, + "step": 6894 + }, + { + "epoch": 0.11032, + "grad_norm": 1.0703125, + "learning_rate": 8.968709677419355e-05, + "loss": 0.1432, + "step": 6895 + }, + { + "epoch": 0.110336, + "grad_norm": 0.8828125, + "learning_rate": 8.968548387096774e-05, + "loss": 0.1561, + "step": 6896 + }, + { + "epoch": 0.110352, + "grad_norm": 0.73046875, + "learning_rate": 8.968387096774194e-05, + "loss": 0.197, + "step": 6897 + }, + { + "epoch": 0.110368, + "grad_norm": 0.671875, + "learning_rate": 8.968225806451614e-05, + "loss": 0.1485, + "step": 6898 + }, + { + "epoch": 0.110384, + "grad_norm": 0.79296875, + "learning_rate": 8.968064516129033e-05, + "loss": 0.1656, + "step": 6899 + }, + { + "epoch": 0.1104, + "grad_norm": 1.7578125, + "learning_rate": 8.967903225806453e-05, + "loss": 0.2083, + "step": 6900 + }, + { + "epoch": 0.110416, + "grad_norm": 0.53125, + "learning_rate": 8.967741935483871e-05, + "loss": 0.1289, + "step": 6901 + }, + { + "epoch": 0.110432, + "grad_norm": 0.87109375, + "learning_rate": 8.967580645161291e-05, + "loss": 0.1888, + "step": 6902 + }, + { + "epoch": 0.110448, + "grad_norm": 1.015625, + "learning_rate": 8.96741935483871e-05, + "loss": 0.2085, + "step": 6903 + }, + { + "epoch": 0.110464, + "grad_norm": 0.984375, + "learning_rate": 8.96725806451613e-05, + "loss": 0.1459, + "step": 6904 + }, + { + "epoch": 0.11048, + "grad_norm": 0.8984375, + "learning_rate": 8.967096774193548e-05, + "loss": 0.154, + "step": 6905 + }, + { + "epoch": 0.110496, + "grad_norm": 0.9140625, + "learning_rate": 8.966935483870968e-05, + "loss": 0.1363, + "step": 6906 + }, + { + "epoch": 0.110512, + "grad_norm": 0.73046875, + "learning_rate": 8.966774193548387e-05, + "loss": 0.2047, + "step": 6907 + }, + { + "epoch": 0.110528, + "grad_norm": 0.765625, + "learning_rate": 8.966612903225807e-05, + "loss": 0.1941, + "step": 6908 + }, + { + "epoch": 0.110544, + "grad_norm": 1.296875, + "learning_rate": 8.966451612903225e-05, + "loss": 0.1766, + "step": 6909 + }, + { + "epoch": 0.11056, + "grad_norm": 0.90625, + "learning_rate": 8.966290322580645e-05, + "loss": 0.1837, + "step": 6910 + }, + { + "epoch": 0.110576, + "grad_norm": 0.984375, + "learning_rate": 8.966129032258065e-05, + "loss": 0.1802, + "step": 6911 + }, + { + "epoch": 0.110592, + "grad_norm": 0.9453125, + "learning_rate": 8.965967741935485e-05, + "loss": 0.1663, + "step": 6912 + }, + { + "epoch": 0.110608, + "grad_norm": 0.578125, + "learning_rate": 8.965806451612904e-05, + "loss": 0.1717, + "step": 6913 + }, + { + "epoch": 0.110624, + "grad_norm": 0.5625, + "learning_rate": 8.965645161290323e-05, + "loss": 0.1795, + "step": 6914 + }, + { + "epoch": 0.11064, + "grad_norm": 1.1796875, + "learning_rate": 8.965483870967743e-05, + "loss": 0.192, + "step": 6915 + }, + { + "epoch": 0.110656, + "grad_norm": 0.8984375, + "learning_rate": 8.965322580645161e-05, + "loss": 0.2337, + "step": 6916 + }, + { + "epoch": 0.110672, + "grad_norm": 0.6640625, + "learning_rate": 8.965161290322581e-05, + "loss": 0.1748, + "step": 6917 + }, + { + "epoch": 0.110688, + "grad_norm": 0.8125, + "learning_rate": 8.965e-05, + "loss": 0.1445, + "step": 6918 + }, + { + "epoch": 0.110704, + "grad_norm": 0.51171875, + "learning_rate": 8.96483870967742e-05, + "loss": 0.161, + "step": 6919 + }, + { + "epoch": 0.11072, + "grad_norm": 1.15625, + "learning_rate": 8.964677419354838e-05, + "loss": 0.2044, + "step": 6920 + }, + { + "epoch": 0.110736, + "grad_norm": 0.73828125, + "learning_rate": 8.964516129032258e-05, + "loss": 0.1729, + "step": 6921 + }, + { + "epoch": 0.110752, + "grad_norm": 0.69140625, + "learning_rate": 8.964354838709678e-05, + "loss": 0.1649, + "step": 6922 + }, + { + "epoch": 0.110768, + "grad_norm": 0.55859375, + "learning_rate": 8.964193548387098e-05, + "loss": 0.1504, + "step": 6923 + }, + { + "epoch": 0.110784, + "grad_norm": 0.77734375, + "learning_rate": 8.964032258064517e-05, + "loss": 0.1713, + "step": 6924 + }, + { + "epoch": 0.1108, + "grad_norm": 0.90234375, + "learning_rate": 8.963870967741937e-05, + "loss": 0.2185, + "step": 6925 + }, + { + "epoch": 0.110816, + "grad_norm": 0.77734375, + "learning_rate": 8.963709677419355e-05, + "loss": 0.1975, + "step": 6926 + }, + { + "epoch": 0.110832, + "grad_norm": 0.88671875, + "learning_rate": 8.963548387096775e-05, + "loss": 0.1636, + "step": 6927 + }, + { + "epoch": 0.110848, + "grad_norm": 1.0234375, + "learning_rate": 8.963387096774194e-05, + "loss": 0.1657, + "step": 6928 + }, + { + "epoch": 0.110864, + "grad_norm": 0.7578125, + "learning_rate": 8.963225806451613e-05, + "loss": 0.1666, + "step": 6929 + }, + { + "epoch": 0.11088, + "grad_norm": 0.94921875, + "learning_rate": 8.963064516129032e-05, + "loss": 0.2165, + "step": 6930 + }, + { + "epoch": 0.110896, + "grad_norm": 0.7265625, + "learning_rate": 8.962903225806451e-05, + "loss": 0.1963, + "step": 6931 + }, + { + "epoch": 0.110912, + "grad_norm": 0.91796875, + "learning_rate": 8.962741935483871e-05, + "loss": 0.2068, + "step": 6932 + }, + { + "epoch": 0.110928, + "grad_norm": 0.68359375, + "learning_rate": 8.962580645161291e-05, + "loss": 0.193, + "step": 6933 + }, + { + "epoch": 0.110944, + "grad_norm": 0.87109375, + "learning_rate": 8.96241935483871e-05, + "loss": 0.193, + "step": 6934 + }, + { + "epoch": 0.11096, + "grad_norm": 0.4609375, + "learning_rate": 8.96225806451613e-05, + "loss": 0.1399, + "step": 6935 + }, + { + "epoch": 0.110976, + "grad_norm": 0.63671875, + "learning_rate": 8.96209677419355e-05, + "loss": 0.1644, + "step": 6936 + }, + { + "epoch": 0.110992, + "grad_norm": 0.8125, + "learning_rate": 8.961935483870968e-05, + "loss": 0.1749, + "step": 6937 + }, + { + "epoch": 0.111008, + "grad_norm": 0.96875, + "learning_rate": 8.961774193548388e-05, + "loss": 0.2324, + "step": 6938 + }, + { + "epoch": 0.111024, + "grad_norm": 0.7578125, + "learning_rate": 8.961612903225807e-05, + "loss": 0.1947, + "step": 6939 + }, + { + "epoch": 0.11104, + "grad_norm": 2.5625, + "learning_rate": 8.961451612903227e-05, + "loss": 0.2396, + "step": 6940 + }, + { + "epoch": 0.111056, + "grad_norm": 0.94140625, + "learning_rate": 8.961290322580645e-05, + "loss": 0.1743, + "step": 6941 + }, + { + "epoch": 0.111072, + "grad_norm": 1.0546875, + "learning_rate": 8.961129032258065e-05, + "loss": 0.1726, + "step": 6942 + }, + { + "epoch": 0.111088, + "grad_norm": 1.0546875, + "learning_rate": 8.960967741935484e-05, + "loss": 0.1661, + "step": 6943 + }, + { + "epoch": 0.111104, + "grad_norm": 1.328125, + "learning_rate": 8.960806451612902e-05, + "loss": 0.1948, + "step": 6944 + }, + { + "epoch": 0.11112, + "grad_norm": 0.65625, + "learning_rate": 8.960645161290322e-05, + "loss": 0.174, + "step": 6945 + }, + { + "epoch": 0.111136, + "grad_norm": 0.93359375, + "learning_rate": 8.960483870967742e-05, + "loss": 0.1544, + "step": 6946 + }, + { + "epoch": 0.111152, + "grad_norm": 0.62890625, + "learning_rate": 8.960322580645162e-05, + "loss": 0.1805, + "step": 6947 + }, + { + "epoch": 0.111168, + "grad_norm": 0.69140625, + "learning_rate": 8.960161290322581e-05, + "loss": 0.177, + "step": 6948 + }, + { + "epoch": 0.111184, + "grad_norm": 0.81640625, + "learning_rate": 8.960000000000001e-05, + "loss": 0.1812, + "step": 6949 + }, + { + "epoch": 0.1112, + "grad_norm": 0.875, + "learning_rate": 8.95983870967742e-05, + "loss": 0.1697, + "step": 6950 + }, + { + "epoch": 0.111216, + "grad_norm": 1.1640625, + "learning_rate": 8.95967741935484e-05, + "loss": 0.1829, + "step": 6951 + }, + { + "epoch": 0.111232, + "grad_norm": 0.703125, + "learning_rate": 8.959516129032258e-05, + "loss": 0.1583, + "step": 6952 + }, + { + "epoch": 0.111248, + "grad_norm": 1.0625, + "learning_rate": 8.959354838709678e-05, + "loss": 0.1822, + "step": 6953 + }, + { + "epoch": 0.111264, + "grad_norm": 0.9296875, + "learning_rate": 8.959193548387097e-05, + "loss": 0.1541, + "step": 6954 + }, + { + "epoch": 0.11128, + "grad_norm": 0.6484375, + "learning_rate": 8.959032258064517e-05, + "loss": 0.1684, + "step": 6955 + }, + { + "epoch": 0.111296, + "grad_norm": 0.8359375, + "learning_rate": 8.958870967741935e-05, + "loss": 0.1738, + "step": 6956 + }, + { + "epoch": 0.111312, + "grad_norm": 0.71484375, + "learning_rate": 8.958709677419355e-05, + "loss": 0.1802, + "step": 6957 + }, + { + "epoch": 0.111328, + "grad_norm": 0.84375, + "learning_rate": 8.958548387096775e-05, + "loss": 0.1908, + "step": 6958 + }, + { + "epoch": 0.111344, + "grad_norm": 0.78125, + "learning_rate": 8.958387096774195e-05, + "loss": 0.1879, + "step": 6959 + }, + { + "epoch": 0.11136, + "grad_norm": 1.0, + "learning_rate": 8.958225806451614e-05, + "loss": 0.2099, + "step": 6960 + }, + { + "epoch": 0.111376, + "grad_norm": 0.92578125, + "learning_rate": 8.958064516129032e-05, + "loss": 0.1907, + "step": 6961 + }, + { + "epoch": 0.111392, + "grad_norm": 0.625, + "learning_rate": 8.957903225806452e-05, + "loss": 0.16, + "step": 6962 + }, + { + "epoch": 0.111408, + "grad_norm": 1.390625, + "learning_rate": 8.957741935483871e-05, + "loss": 0.179, + "step": 6963 + }, + { + "epoch": 0.111424, + "grad_norm": 0.61328125, + "learning_rate": 8.957580645161291e-05, + "loss": 0.1417, + "step": 6964 + }, + { + "epoch": 0.11144, + "grad_norm": 0.6640625, + "learning_rate": 8.95741935483871e-05, + "loss": 0.1694, + "step": 6965 + }, + { + "epoch": 0.111456, + "grad_norm": 0.63671875, + "learning_rate": 8.95725806451613e-05, + "loss": 0.154, + "step": 6966 + }, + { + "epoch": 0.111472, + "grad_norm": 0.58984375, + "learning_rate": 8.957096774193548e-05, + "loss": 0.1711, + "step": 6967 + }, + { + "epoch": 0.111488, + "grad_norm": 0.984375, + "learning_rate": 8.956935483870968e-05, + "loss": 0.189, + "step": 6968 + }, + { + "epoch": 0.111504, + "grad_norm": 0.9140625, + "learning_rate": 8.956774193548387e-05, + "loss": 0.1905, + "step": 6969 + }, + { + "epoch": 0.11152, + "grad_norm": 0.86328125, + "learning_rate": 8.956612903225807e-05, + "loss": 0.1875, + "step": 6970 + }, + { + "epoch": 0.111536, + "grad_norm": 1.390625, + "learning_rate": 8.956451612903227e-05, + "loss": 0.1971, + "step": 6971 + }, + { + "epoch": 0.111552, + "grad_norm": 0.68359375, + "learning_rate": 8.956290322580647e-05, + "loss": 0.1659, + "step": 6972 + }, + { + "epoch": 0.111568, + "grad_norm": 0.462890625, + "learning_rate": 8.956129032258065e-05, + "loss": 0.148, + "step": 6973 + }, + { + "epoch": 0.111584, + "grad_norm": 0.7734375, + "learning_rate": 8.955967741935485e-05, + "loss": 0.2014, + "step": 6974 + }, + { + "epoch": 0.1116, + "grad_norm": 0.66796875, + "learning_rate": 8.955806451612904e-05, + "loss": 0.163, + "step": 6975 + }, + { + "epoch": 0.111616, + "grad_norm": 1.015625, + "learning_rate": 8.955645161290322e-05, + "loss": 0.1963, + "step": 6976 + }, + { + "epoch": 0.111632, + "grad_norm": 1.5390625, + "learning_rate": 8.955483870967742e-05, + "loss": 0.2143, + "step": 6977 + }, + { + "epoch": 0.111648, + "grad_norm": 0.75, + "learning_rate": 8.955322580645161e-05, + "loss": 0.1882, + "step": 6978 + }, + { + "epoch": 0.111664, + "grad_norm": 0.63671875, + "learning_rate": 8.955161290322581e-05, + "loss": 0.1521, + "step": 6979 + }, + { + "epoch": 0.11168, + "grad_norm": 0.578125, + "learning_rate": 8.955e-05, + "loss": 0.1923, + "step": 6980 + }, + { + "epoch": 0.111696, + "grad_norm": 0.828125, + "learning_rate": 8.95483870967742e-05, + "loss": 0.189, + "step": 6981 + }, + { + "epoch": 0.111712, + "grad_norm": 1.09375, + "learning_rate": 8.95467741935484e-05, + "loss": 0.195, + "step": 6982 + }, + { + "epoch": 0.111728, + "grad_norm": 0.8046875, + "learning_rate": 8.95451612903226e-05, + "loss": 0.1794, + "step": 6983 + }, + { + "epoch": 0.111744, + "grad_norm": 1.5390625, + "learning_rate": 8.954354838709678e-05, + "loss": 0.1687, + "step": 6984 + }, + { + "epoch": 0.11176, + "grad_norm": 0.76953125, + "learning_rate": 8.954193548387098e-05, + "loss": 0.1884, + "step": 6985 + }, + { + "epoch": 0.111776, + "grad_norm": 0.8359375, + "learning_rate": 8.954032258064517e-05, + "loss": 0.1997, + "step": 6986 + }, + { + "epoch": 0.111792, + "grad_norm": 1.4765625, + "learning_rate": 8.953870967741937e-05, + "loss": 0.2151, + "step": 6987 + }, + { + "epoch": 0.111808, + "grad_norm": 1.1015625, + "learning_rate": 8.953709677419355e-05, + "loss": 0.174, + "step": 6988 + }, + { + "epoch": 0.111824, + "grad_norm": 0.70703125, + "learning_rate": 8.953548387096775e-05, + "loss": 0.1678, + "step": 6989 + }, + { + "epoch": 0.11184, + "grad_norm": 0.96875, + "learning_rate": 8.953387096774194e-05, + "loss": 0.1472, + "step": 6990 + }, + { + "epoch": 0.111856, + "grad_norm": 1.0078125, + "learning_rate": 8.953225806451612e-05, + "loss": 0.142, + "step": 6991 + }, + { + "epoch": 0.111872, + "grad_norm": 1.03125, + "learning_rate": 8.953064516129032e-05, + "loss": 0.1743, + "step": 6992 + }, + { + "epoch": 0.111888, + "grad_norm": 0.96875, + "learning_rate": 8.952903225806452e-05, + "loss": 0.1898, + "step": 6993 + }, + { + "epoch": 0.111904, + "grad_norm": 1.2109375, + "learning_rate": 8.952741935483872e-05, + "loss": 0.2283, + "step": 6994 + }, + { + "epoch": 0.11192, + "grad_norm": 1.1015625, + "learning_rate": 8.952580645161291e-05, + "loss": 0.1857, + "step": 6995 + }, + { + "epoch": 0.111936, + "grad_norm": 0.7109375, + "learning_rate": 8.952419354838711e-05, + "loss": 0.1524, + "step": 6996 + }, + { + "epoch": 0.111952, + "grad_norm": 0.71484375, + "learning_rate": 8.95225806451613e-05, + "loss": 0.2097, + "step": 6997 + }, + { + "epoch": 0.111968, + "grad_norm": 0.55078125, + "learning_rate": 8.952096774193549e-05, + "loss": 0.1644, + "step": 6998 + }, + { + "epoch": 0.111984, + "grad_norm": 0.59375, + "learning_rate": 8.951935483870968e-05, + "loss": 0.145, + "step": 6999 + }, + { + "epoch": 0.112, + "grad_norm": 0.7734375, + "learning_rate": 8.951774193548388e-05, + "loss": 0.1918, + "step": 7000 + }, + { + "epoch": 0.112016, + "grad_norm": 0.96484375, + "learning_rate": 8.951612903225806e-05, + "loss": 0.1793, + "step": 7001 + }, + { + "epoch": 0.112032, + "grad_norm": 0.86328125, + "learning_rate": 8.951451612903226e-05, + "loss": 0.1403, + "step": 7002 + }, + { + "epoch": 0.112048, + "grad_norm": 1.1875, + "learning_rate": 8.951290322580645e-05, + "loss": 0.1959, + "step": 7003 + }, + { + "epoch": 0.112064, + "grad_norm": 1.5546875, + "learning_rate": 8.951129032258065e-05, + "loss": 0.1748, + "step": 7004 + }, + { + "epoch": 0.11208, + "grad_norm": 2.421875, + "learning_rate": 8.950967741935484e-05, + "loss": 0.1739, + "step": 7005 + }, + { + "epoch": 0.112096, + "grad_norm": 0.87890625, + "learning_rate": 8.950806451612904e-05, + "loss": 0.2037, + "step": 7006 + }, + { + "epoch": 0.112112, + "grad_norm": 0.8671875, + "learning_rate": 8.950645161290324e-05, + "loss": 0.2094, + "step": 7007 + }, + { + "epoch": 0.112128, + "grad_norm": 0.796875, + "learning_rate": 8.950483870967742e-05, + "loss": 0.1729, + "step": 7008 + }, + { + "epoch": 0.112144, + "grad_norm": 0.87109375, + "learning_rate": 8.950322580645162e-05, + "loss": 0.184, + "step": 7009 + }, + { + "epoch": 0.11216, + "grad_norm": 1.578125, + "learning_rate": 8.950161290322581e-05, + "loss": 0.1578, + "step": 7010 + }, + { + "epoch": 0.112176, + "grad_norm": 1.875, + "learning_rate": 8.950000000000001e-05, + "loss": 0.1822, + "step": 7011 + }, + { + "epoch": 0.112192, + "grad_norm": 1.0703125, + "learning_rate": 8.949838709677419e-05, + "loss": 0.1683, + "step": 7012 + }, + { + "epoch": 0.112208, + "grad_norm": 1.4765625, + "learning_rate": 8.949677419354839e-05, + "loss": 0.151, + "step": 7013 + }, + { + "epoch": 0.112224, + "grad_norm": 1.515625, + "learning_rate": 8.949516129032258e-05, + "loss": 0.1786, + "step": 7014 + }, + { + "epoch": 0.11224, + "grad_norm": 0.74609375, + "learning_rate": 8.949354838709678e-05, + "loss": 0.1681, + "step": 7015 + }, + { + "epoch": 0.112256, + "grad_norm": 1.2109375, + "learning_rate": 8.949193548387096e-05, + "loss": 0.1534, + "step": 7016 + }, + { + "epoch": 0.112272, + "grad_norm": 0.90234375, + "learning_rate": 8.949032258064516e-05, + "loss": 0.1657, + "step": 7017 + }, + { + "epoch": 0.112288, + "grad_norm": 1.3671875, + "learning_rate": 8.948870967741936e-05, + "loss": 0.1999, + "step": 7018 + }, + { + "epoch": 0.112304, + "grad_norm": 0.78125, + "learning_rate": 8.948709677419356e-05, + "loss": 0.1818, + "step": 7019 + }, + { + "epoch": 0.11232, + "grad_norm": 1.6328125, + "learning_rate": 8.948548387096775e-05, + "loss": 0.1789, + "step": 7020 + }, + { + "epoch": 0.112336, + "grad_norm": 1.0234375, + "learning_rate": 8.948387096774195e-05, + "loss": 0.1851, + "step": 7021 + }, + { + "epoch": 0.112352, + "grad_norm": 0.9296875, + "learning_rate": 8.948225806451614e-05, + "loss": 0.1552, + "step": 7022 + }, + { + "epoch": 0.112368, + "grad_norm": 0.92578125, + "learning_rate": 8.948064516129032e-05, + "loss": 0.1554, + "step": 7023 + }, + { + "epoch": 0.112384, + "grad_norm": 0.74609375, + "learning_rate": 8.947903225806452e-05, + "loss": 0.1357, + "step": 7024 + }, + { + "epoch": 0.1124, + "grad_norm": 0.67578125, + "learning_rate": 8.947741935483871e-05, + "loss": 0.1943, + "step": 7025 + }, + { + "epoch": 0.112416, + "grad_norm": 1.1015625, + "learning_rate": 8.94758064516129e-05, + "loss": 0.2256, + "step": 7026 + }, + { + "epoch": 0.112432, + "grad_norm": 0.51171875, + "learning_rate": 8.947419354838709e-05, + "loss": 0.1526, + "step": 7027 + }, + { + "epoch": 0.112448, + "grad_norm": 1.9921875, + "learning_rate": 8.947258064516129e-05, + "loss": 0.1914, + "step": 7028 + }, + { + "epoch": 0.112464, + "grad_norm": 0.59765625, + "learning_rate": 8.947096774193549e-05, + "loss": 0.1552, + "step": 7029 + }, + { + "epoch": 0.11248, + "grad_norm": 0.7109375, + "learning_rate": 8.946935483870968e-05, + "loss": 0.139, + "step": 7030 + }, + { + "epoch": 0.112496, + "grad_norm": 0.87109375, + "learning_rate": 8.946774193548388e-05, + "loss": 0.2127, + "step": 7031 + }, + { + "epoch": 0.112512, + "grad_norm": 1.0234375, + "learning_rate": 8.946612903225808e-05, + "loss": 0.1888, + "step": 7032 + }, + { + "epoch": 0.112528, + "grad_norm": 0.5625, + "learning_rate": 8.946451612903226e-05, + "loss": 0.1719, + "step": 7033 + }, + { + "epoch": 0.112544, + "grad_norm": 0.94921875, + "learning_rate": 8.946290322580646e-05, + "loss": 0.1843, + "step": 7034 + }, + { + "epoch": 0.11256, + "grad_norm": 1.015625, + "learning_rate": 8.946129032258065e-05, + "loss": 0.1795, + "step": 7035 + }, + { + "epoch": 0.112576, + "grad_norm": 0.9140625, + "learning_rate": 8.945967741935485e-05, + "loss": 0.2081, + "step": 7036 + }, + { + "epoch": 0.112592, + "grad_norm": 0.8203125, + "learning_rate": 8.945806451612903e-05, + "loss": 0.1913, + "step": 7037 + }, + { + "epoch": 0.112608, + "grad_norm": 1.046875, + "learning_rate": 8.945645161290322e-05, + "loss": 0.1947, + "step": 7038 + }, + { + "epoch": 0.112624, + "grad_norm": 0.84765625, + "learning_rate": 8.945483870967742e-05, + "loss": 0.1582, + "step": 7039 + }, + { + "epoch": 0.11264, + "grad_norm": 0.83984375, + "learning_rate": 8.94532258064516e-05, + "loss": 0.1512, + "step": 7040 + }, + { + "epoch": 0.112656, + "grad_norm": 0.62109375, + "learning_rate": 8.94516129032258e-05, + "loss": 0.1768, + "step": 7041 + }, + { + "epoch": 0.112672, + "grad_norm": 0.84765625, + "learning_rate": 8.945e-05, + "loss": 0.2232, + "step": 7042 + }, + { + "epoch": 0.112688, + "grad_norm": 0.953125, + "learning_rate": 8.94483870967742e-05, + "loss": 0.2216, + "step": 7043 + }, + { + "epoch": 0.112704, + "grad_norm": 1.0, + "learning_rate": 8.944677419354839e-05, + "loss": 0.2209, + "step": 7044 + }, + { + "epoch": 0.11272, + "grad_norm": 0.890625, + "learning_rate": 8.944516129032259e-05, + "loss": 0.2062, + "step": 7045 + }, + { + "epoch": 0.112736, + "grad_norm": 0.95703125, + "learning_rate": 8.944354838709678e-05, + "loss": 0.1858, + "step": 7046 + }, + { + "epoch": 0.112752, + "grad_norm": 0.703125, + "learning_rate": 8.944193548387098e-05, + "loss": 0.1502, + "step": 7047 + }, + { + "epoch": 0.112768, + "grad_norm": 1.3828125, + "learning_rate": 8.944032258064516e-05, + "loss": 0.1648, + "step": 7048 + }, + { + "epoch": 0.112784, + "grad_norm": 1.65625, + "learning_rate": 8.943870967741936e-05, + "loss": 0.2302, + "step": 7049 + }, + { + "epoch": 0.1128, + "grad_norm": 0.88671875, + "learning_rate": 8.943709677419355e-05, + "loss": 0.1947, + "step": 7050 + }, + { + "epoch": 0.112816, + "grad_norm": 0.8515625, + "learning_rate": 8.943548387096775e-05, + "loss": 0.1701, + "step": 7051 + }, + { + "epoch": 0.112832, + "grad_norm": 0.78515625, + "learning_rate": 8.943387096774193e-05, + "loss": 0.2338, + "step": 7052 + }, + { + "epoch": 0.112848, + "grad_norm": 0.71484375, + "learning_rate": 8.943225806451613e-05, + "loss": 0.2021, + "step": 7053 + }, + { + "epoch": 0.112864, + "grad_norm": 0.9453125, + "learning_rate": 8.943064516129033e-05, + "loss": 0.158, + "step": 7054 + }, + { + "epoch": 0.11288, + "grad_norm": 0.83984375, + "learning_rate": 8.942903225806452e-05, + "loss": 0.1996, + "step": 7055 + }, + { + "epoch": 0.112896, + "grad_norm": 1.015625, + "learning_rate": 8.942741935483872e-05, + "loss": 0.182, + "step": 7056 + }, + { + "epoch": 0.112912, + "grad_norm": 1.078125, + "learning_rate": 8.94258064516129e-05, + "loss": 0.1832, + "step": 7057 + }, + { + "epoch": 0.112928, + "grad_norm": 0.6328125, + "learning_rate": 8.94241935483871e-05, + "loss": 0.1998, + "step": 7058 + }, + { + "epoch": 0.112944, + "grad_norm": 0.984375, + "learning_rate": 8.942258064516129e-05, + "loss": 0.1691, + "step": 7059 + }, + { + "epoch": 0.11296, + "grad_norm": 0.76953125, + "learning_rate": 8.942096774193549e-05, + "loss": 0.1765, + "step": 7060 + }, + { + "epoch": 0.112976, + "grad_norm": 0.7421875, + "learning_rate": 8.941935483870968e-05, + "loss": 0.1824, + "step": 7061 + }, + { + "epoch": 0.112992, + "grad_norm": 0.71484375, + "learning_rate": 8.941774193548388e-05, + "loss": 0.151, + "step": 7062 + }, + { + "epoch": 0.113008, + "grad_norm": 0.69921875, + "learning_rate": 8.941612903225806e-05, + "loss": 0.2011, + "step": 7063 + }, + { + "epoch": 0.113024, + "grad_norm": 1.4453125, + "learning_rate": 8.941451612903226e-05, + "loss": 0.2159, + "step": 7064 + }, + { + "epoch": 0.11304, + "grad_norm": 0.65625, + "learning_rate": 8.941290322580645e-05, + "loss": 0.1896, + "step": 7065 + }, + { + "epoch": 0.113056, + "grad_norm": 0.72265625, + "learning_rate": 8.941129032258065e-05, + "loss": 0.1614, + "step": 7066 + }, + { + "epoch": 0.113072, + "grad_norm": 0.953125, + "learning_rate": 8.940967741935485e-05, + "loss": 0.1795, + "step": 7067 + }, + { + "epoch": 0.113088, + "grad_norm": 0.61328125, + "learning_rate": 8.940806451612905e-05, + "loss": 0.183, + "step": 7068 + }, + { + "epoch": 0.113104, + "grad_norm": 0.62109375, + "learning_rate": 8.940645161290323e-05, + "loss": 0.1821, + "step": 7069 + }, + { + "epoch": 0.11312, + "grad_norm": 0.765625, + "learning_rate": 8.940483870967742e-05, + "loss": 0.1953, + "step": 7070 + }, + { + "epoch": 0.113136, + "grad_norm": 0.94921875, + "learning_rate": 8.940322580645162e-05, + "loss": 0.1572, + "step": 7071 + }, + { + "epoch": 0.113152, + "grad_norm": 0.46875, + "learning_rate": 8.94016129032258e-05, + "loss": 0.1821, + "step": 7072 + }, + { + "epoch": 0.113168, + "grad_norm": 0.83984375, + "learning_rate": 8.94e-05, + "loss": 0.2009, + "step": 7073 + }, + { + "epoch": 0.113184, + "grad_norm": 0.73828125, + "learning_rate": 8.939838709677419e-05, + "loss": 0.1869, + "step": 7074 + }, + { + "epoch": 0.1132, + "grad_norm": 0.890625, + "learning_rate": 8.939677419354839e-05, + "loss": 0.1773, + "step": 7075 + }, + { + "epoch": 0.113216, + "grad_norm": 0.7578125, + "learning_rate": 8.939516129032258e-05, + "loss": 0.2172, + "step": 7076 + }, + { + "epoch": 0.113232, + "grad_norm": 0.609375, + "learning_rate": 8.939354838709678e-05, + "loss": 0.178, + "step": 7077 + }, + { + "epoch": 0.113248, + "grad_norm": 0.90625, + "learning_rate": 8.939193548387098e-05, + "loss": 0.1915, + "step": 7078 + }, + { + "epoch": 0.113264, + "grad_norm": 0.7890625, + "learning_rate": 8.939032258064518e-05, + "loss": 0.1643, + "step": 7079 + }, + { + "epoch": 0.11328, + "grad_norm": 0.77734375, + "learning_rate": 8.938870967741936e-05, + "loss": 0.1519, + "step": 7080 + }, + { + "epoch": 0.113296, + "grad_norm": 1.015625, + "learning_rate": 8.938709677419356e-05, + "loss": 0.1908, + "step": 7081 + }, + { + "epoch": 0.113312, + "grad_norm": 0.71484375, + "learning_rate": 8.938548387096775e-05, + "loss": 0.1764, + "step": 7082 + }, + { + "epoch": 0.113328, + "grad_norm": 1.09375, + "learning_rate": 8.938387096774195e-05, + "loss": 0.167, + "step": 7083 + }, + { + "epoch": 0.113344, + "grad_norm": 0.8359375, + "learning_rate": 8.938225806451613e-05, + "loss": 0.1415, + "step": 7084 + }, + { + "epoch": 0.11336, + "grad_norm": 0.66796875, + "learning_rate": 8.938064516129032e-05, + "loss": 0.1256, + "step": 7085 + }, + { + "epoch": 0.113376, + "grad_norm": 0.62890625, + "learning_rate": 8.937903225806452e-05, + "loss": 0.1365, + "step": 7086 + }, + { + "epoch": 0.113392, + "grad_norm": 0.52734375, + "learning_rate": 8.93774193548387e-05, + "loss": 0.1616, + "step": 7087 + }, + { + "epoch": 0.113408, + "grad_norm": 1.15625, + "learning_rate": 8.93758064516129e-05, + "loss": 0.1969, + "step": 7088 + }, + { + "epoch": 0.113424, + "grad_norm": 0.65625, + "learning_rate": 8.93741935483871e-05, + "loss": 0.2016, + "step": 7089 + }, + { + "epoch": 0.11344, + "grad_norm": 0.91796875, + "learning_rate": 8.93725806451613e-05, + "loss": 0.1637, + "step": 7090 + }, + { + "epoch": 0.113456, + "grad_norm": 1.0625, + "learning_rate": 8.937096774193549e-05, + "loss": 0.2007, + "step": 7091 + }, + { + "epoch": 0.113472, + "grad_norm": 1.0546875, + "learning_rate": 8.936935483870969e-05, + "loss": 0.1536, + "step": 7092 + }, + { + "epoch": 0.113488, + "grad_norm": 0.81640625, + "learning_rate": 8.936774193548388e-05, + "loss": 0.184, + "step": 7093 + }, + { + "epoch": 0.113504, + "grad_norm": 0.9140625, + "learning_rate": 8.936612903225807e-05, + "loss": 0.1961, + "step": 7094 + }, + { + "epoch": 0.11352, + "grad_norm": 0.99609375, + "learning_rate": 8.936451612903226e-05, + "loss": 0.1889, + "step": 7095 + }, + { + "epoch": 0.113536, + "grad_norm": 0.86328125, + "learning_rate": 8.936290322580646e-05, + "loss": 0.1721, + "step": 7096 + }, + { + "epoch": 0.113552, + "grad_norm": 0.71875, + "learning_rate": 8.936129032258065e-05, + "loss": 0.1718, + "step": 7097 + }, + { + "epoch": 0.113568, + "grad_norm": 1.1953125, + "learning_rate": 8.935967741935485e-05, + "loss": 0.2018, + "step": 7098 + }, + { + "epoch": 0.113584, + "grad_norm": 0.8203125, + "learning_rate": 8.935806451612903e-05, + "loss": 0.1553, + "step": 7099 + }, + { + "epoch": 0.1136, + "grad_norm": 0.68359375, + "learning_rate": 8.935645161290322e-05, + "loss": 0.1856, + "step": 7100 + }, + { + "epoch": 0.113616, + "grad_norm": 0.8125, + "learning_rate": 8.935483870967742e-05, + "loss": 0.1801, + "step": 7101 + }, + { + "epoch": 0.113632, + "grad_norm": 1.0625, + "learning_rate": 8.935322580645162e-05, + "loss": 0.212, + "step": 7102 + }, + { + "epoch": 0.113648, + "grad_norm": 1.7265625, + "learning_rate": 8.935161290322582e-05, + "loss": 0.1778, + "step": 7103 + }, + { + "epoch": 0.113664, + "grad_norm": 1.2421875, + "learning_rate": 8.935e-05, + "loss": 0.205, + "step": 7104 + }, + { + "epoch": 0.11368, + "grad_norm": 0.80078125, + "learning_rate": 8.93483870967742e-05, + "loss": 0.2084, + "step": 7105 + }, + { + "epoch": 0.113696, + "grad_norm": 0.59375, + "learning_rate": 8.934677419354839e-05, + "loss": 0.146, + "step": 7106 + }, + { + "epoch": 0.113712, + "grad_norm": 0.8515625, + "learning_rate": 8.934516129032259e-05, + "loss": 0.1927, + "step": 7107 + }, + { + "epoch": 0.113728, + "grad_norm": 0.546875, + "learning_rate": 8.934354838709677e-05, + "loss": 0.1724, + "step": 7108 + }, + { + "epoch": 0.113744, + "grad_norm": 0.79296875, + "learning_rate": 8.934193548387097e-05, + "loss": 0.1823, + "step": 7109 + }, + { + "epoch": 0.11376, + "grad_norm": 1.2109375, + "learning_rate": 8.934032258064516e-05, + "loss": 0.1483, + "step": 7110 + }, + { + "epoch": 0.113776, + "grad_norm": 0.65625, + "learning_rate": 8.933870967741936e-05, + "loss": 0.1944, + "step": 7111 + }, + { + "epoch": 0.113792, + "grad_norm": 0.6640625, + "learning_rate": 8.933709677419355e-05, + "loss": 0.1591, + "step": 7112 + }, + { + "epoch": 0.113808, + "grad_norm": 0.6171875, + "learning_rate": 8.933548387096775e-05, + "loss": 0.168, + "step": 7113 + }, + { + "epoch": 0.113824, + "grad_norm": 0.66796875, + "learning_rate": 8.933387096774195e-05, + "loss": 0.1478, + "step": 7114 + }, + { + "epoch": 0.11384, + "grad_norm": 0.96484375, + "learning_rate": 8.933225806451613e-05, + "loss": 0.1876, + "step": 7115 + }, + { + "epoch": 0.113856, + "grad_norm": 0.8203125, + "learning_rate": 8.933064516129033e-05, + "loss": 0.2037, + "step": 7116 + }, + { + "epoch": 0.113872, + "grad_norm": 0.69921875, + "learning_rate": 8.932903225806452e-05, + "loss": 0.1441, + "step": 7117 + }, + { + "epoch": 0.113888, + "grad_norm": 1.09375, + "learning_rate": 8.932741935483872e-05, + "loss": 0.2117, + "step": 7118 + }, + { + "epoch": 0.113904, + "grad_norm": 0.60546875, + "learning_rate": 8.93258064516129e-05, + "loss": 0.1666, + "step": 7119 + }, + { + "epoch": 0.11392, + "grad_norm": 0.62890625, + "learning_rate": 8.93241935483871e-05, + "loss": 0.1497, + "step": 7120 + }, + { + "epoch": 0.113936, + "grad_norm": 0.6953125, + "learning_rate": 8.932258064516129e-05, + "loss": 0.1925, + "step": 7121 + }, + { + "epoch": 0.113952, + "grad_norm": 2.125, + "learning_rate": 8.932096774193549e-05, + "loss": 0.2082, + "step": 7122 + }, + { + "epoch": 0.113968, + "grad_norm": 1.1640625, + "learning_rate": 8.931935483870967e-05, + "loss": 0.1503, + "step": 7123 + }, + { + "epoch": 0.113984, + "grad_norm": 1.0625, + "learning_rate": 8.931774193548387e-05, + "loss": 0.1957, + "step": 7124 + }, + { + "epoch": 0.114, + "grad_norm": 0.59765625, + "learning_rate": 8.931612903225806e-05, + "loss": 0.1664, + "step": 7125 + }, + { + "epoch": 0.114016, + "grad_norm": 0.62109375, + "learning_rate": 8.931451612903226e-05, + "loss": 0.1689, + "step": 7126 + }, + { + "epoch": 0.114032, + "grad_norm": 0.73828125, + "learning_rate": 8.931290322580646e-05, + "loss": 0.1742, + "step": 7127 + }, + { + "epoch": 0.114048, + "grad_norm": 1.0078125, + "learning_rate": 8.931129032258066e-05, + "loss": 0.2017, + "step": 7128 + }, + { + "epoch": 0.114064, + "grad_norm": 1.03125, + "learning_rate": 8.930967741935485e-05, + "loss": 0.1966, + "step": 7129 + }, + { + "epoch": 0.11408, + "grad_norm": 0.90234375, + "learning_rate": 8.930806451612904e-05, + "loss": 0.1661, + "step": 7130 + }, + { + "epoch": 0.114096, + "grad_norm": 0.64453125, + "learning_rate": 8.930645161290323e-05, + "loss": 0.1623, + "step": 7131 + }, + { + "epoch": 0.114112, + "grad_norm": 1.03125, + "learning_rate": 8.930483870967742e-05, + "loss": 0.1818, + "step": 7132 + }, + { + "epoch": 0.114128, + "grad_norm": 1.046875, + "learning_rate": 8.930322580645162e-05, + "loss": 0.1615, + "step": 7133 + }, + { + "epoch": 0.114144, + "grad_norm": 0.84375, + "learning_rate": 8.93016129032258e-05, + "loss": 0.1711, + "step": 7134 + }, + { + "epoch": 0.11416, + "grad_norm": 0.90234375, + "learning_rate": 8.93e-05, + "loss": 0.1804, + "step": 7135 + }, + { + "epoch": 0.114176, + "grad_norm": 1.421875, + "learning_rate": 8.929838709677419e-05, + "loss": 0.222, + "step": 7136 + }, + { + "epoch": 0.114192, + "grad_norm": 1.84375, + "learning_rate": 8.929677419354839e-05, + "loss": 0.2287, + "step": 7137 + }, + { + "epoch": 0.114208, + "grad_norm": 0.81640625, + "learning_rate": 8.929516129032259e-05, + "loss": 0.1902, + "step": 7138 + }, + { + "epoch": 0.114224, + "grad_norm": 0.59375, + "learning_rate": 8.929354838709679e-05, + "loss": 0.1753, + "step": 7139 + }, + { + "epoch": 0.11424, + "grad_norm": 0.75390625, + "learning_rate": 8.929193548387097e-05, + "loss": 0.1905, + "step": 7140 + }, + { + "epoch": 0.114256, + "grad_norm": 0.890625, + "learning_rate": 8.929032258064517e-05, + "loss": 0.1547, + "step": 7141 + }, + { + "epoch": 0.114272, + "grad_norm": 0.77734375, + "learning_rate": 8.928870967741936e-05, + "loss": 0.1859, + "step": 7142 + }, + { + "epoch": 0.114288, + "grad_norm": 1.9609375, + "learning_rate": 8.928709677419356e-05, + "loss": 0.1964, + "step": 7143 + }, + { + "epoch": 0.114304, + "grad_norm": 2.078125, + "learning_rate": 8.928548387096774e-05, + "loss": 0.1897, + "step": 7144 + }, + { + "epoch": 0.11432, + "grad_norm": 0.92578125, + "learning_rate": 8.928387096774194e-05, + "loss": 0.1594, + "step": 7145 + }, + { + "epoch": 0.114336, + "grad_norm": 1.0234375, + "learning_rate": 8.928225806451613e-05, + "loss": 0.1962, + "step": 7146 + }, + { + "epoch": 0.114352, + "grad_norm": 0.875, + "learning_rate": 8.928064516129032e-05, + "loss": 0.1553, + "step": 7147 + }, + { + "epoch": 0.114368, + "grad_norm": 0.953125, + "learning_rate": 8.927903225806452e-05, + "loss": 0.1542, + "step": 7148 + }, + { + "epoch": 0.114384, + "grad_norm": 0.9140625, + "learning_rate": 8.927741935483872e-05, + "loss": 0.1951, + "step": 7149 + }, + { + "epoch": 0.1144, + "grad_norm": 1.125, + "learning_rate": 8.927580645161292e-05, + "loss": 0.2121, + "step": 7150 + }, + { + "epoch": 0.114416, + "grad_norm": 0.9296875, + "learning_rate": 8.92741935483871e-05, + "loss": 0.1528, + "step": 7151 + }, + { + "epoch": 0.114432, + "grad_norm": 0.93359375, + "learning_rate": 8.92725806451613e-05, + "loss": 0.2075, + "step": 7152 + }, + { + "epoch": 0.114448, + "grad_norm": 0.68359375, + "learning_rate": 8.927096774193549e-05, + "loss": 0.1746, + "step": 7153 + }, + { + "epoch": 0.114464, + "grad_norm": 0.85546875, + "learning_rate": 8.926935483870969e-05, + "loss": 0.1803, + "step": 7154 + }, + { + "epoch": 0.11448, + "grad_norm": 0.9140625, + "learning_rate": 8.926774193548387e-05, + "loss": 0.1997, + "step": 7155 + }, + { + "epoch": 0.114496, + "grad_norm": 0.7890625, + "learning_rate": 8.926612903225807e-05, + "loss": 0.1767, + "step": 7156 + }, + { + "epoch": 0.114512, + "grad_norm": 1.3671875, + "learning_rate": 8.926451612903226e-05, + "loss": 0.1717, + "step": 7157 + }, + { + "epoch": 0.114528, + "grad_norm": 0.67578125, + "learning_rate": 8.926290322580646e-05, + "loss": 0.1664, + "step": 7158 + }, + { + "epoch": 0.114544, + "grad_norm": 0.5703125, + "learning_rate": 8.926129032258064e-05, + "loss": 0.1553, + "step": 7159 + }, + { + "epoch": 0.11456, + "grad_norm": 0.76171875, + "learning_rate": 8.925967741935484e-05, + "loss": 0.1833, + "step": 7160 + }, + { + "epoch": 0.114576, + "grad_norm": 0.94140625, + "learning_rate": 8.925806451612903e-05, + "loss": 0.181, + "step": 7161 + }, + { + "epoch": 0.114592, + "grad_norm": 0.66796875, + "learning_rate": 8.925645161290323e-05, + "loss": 0.1649, + "step": 7162 + }, + { + "epoch": 0.114608, + "grad_norm": 0.890625, + "learning_rate": 8.925483870967743e-05, + "loss": 0.1554, + "step": 7163 + }, + { + "epoch": 0.114624, + "grad_norm": 0.90625, + "learning_rate": 8.925322580645162e-05, + "loss": 0.2164, + "step": 7164 + }, + { + "epoch": 0.11464, + "grad_norm": 0.90234375, + "learning_rate": 8.925161290322581e-05, + "loss": 0.1941, + "step": 7165 + }, + { + "epoch": 0.114656, + "grad_norm": 0.78515625, + "learning_rate": 8.925e-05, + "loss": 0.184, + "step": 7166 + }, + { + "epoch": 0.114672, + "grad_norm": 0.7265625, + "learning_rate": 8.92483870967742e-05, + "loss": 0.1605, + "step": 7167 + }, + { + "epoch": 0.114688, + "grad_norm": 0.65625, + "learning_rate": 8.924677419354839e-05, + "loss": 0.1415, + "step": 7168 + }, + { + "epoch": 0.114704, + "grad_norm": 0.85546875, + "learning_rate": 8.924516129032259e-05, + "loss": 0.1603, + "step": 7169 + }, + { + "epoch": 0.11472, + "grad_norm": 0.7734375, + "learning_rate": 8.924354838709677e-05, + "loss": 0.1748, + "step": 7170 + }, + { + "epoch": 0.114736, + "grad_norm": 0.97265625, + "learning_rate": 8.924193548387097e-05, + "loss": 0.2033, + "step": 7171 + }, + { + "epoch": 0.114752, + "grad_norm": 0.76953125, + "learning_rate": 8.924032258064516e-05, + "loss": 0.1808, + "step": 7172 + }, + { + "epoch": 0.114768, + "grad_norm": 1.0, + "learning_rate": 8.923870967741936e-05, + "loss": 0.1637, + "step": 7173 + }, + { + "epoch": 0.114784, + "grad_norm": 0.73046875, + "learning_rate": 8.923709677419356e-05, + "loss": 0.2137, + "step": 7174 + }, + { + "epoch": 0.1148, + "grad_norm": 0.67578125, + "learning_rate": 8.923548387096776e-05, + "loss": 0.2017, + "step": 7175 + }, + { + "epoch": 0.114816, + "grad_norm": 0.66796875, + "learning_rate": 8.923387096774194e-05, + "loss": 0.1818, + "step": 7176 + }, + { + "epoch": 0.114832, + "grad_norm": 1.0703125, + "learning_rate": 8.923225806451614e-05, + "loss": 0.2574, + "step": 7177 + }, + { + "epoch": 0.114848, + "grad_norm": 1.484375, + "learning_rate": 8.923064516129033e-05, + "loss": 0.237, + "step": 7178 + }, + { + "epoch": 0.114864, + "grad_norm": 0.8359375, + "learning_rate": 8.922903225806451e-05, + "loss": 0.209, + "step": 7179 + }, + { + "epoch": 0.11488, + "grad_norm": 0.71484375, + "learning_rate": 8.922741935483871e-05, + "loss": 0.2045, + "step": 7180 + }, + { + "epoch": 0.114896, + "grad_norm": 0.82421875, + "learning_rate": 8.92258064516129e-05, + "loss": 0.1674, + "step": 7181 + }, + { + "epoch": 0.114912, + "grad_norm": 0.74609375, + "learning_rate": 8.92241935483871e-05, + "loss": 0.1338, + "step": 7182 + }, + { + "epoch": 0.114928, + "grad_norm": 1.171875, + "learning_rate": 8.922258064516129e-05, + "loss": 0.179, + "step": 7183 + }, + { + "epoch": 0.114944, + "grad_norm": 0.5234375, + "learning_rate": 8.922096774193549e-05, + "loss": 0.1827, + "step": 7184 + }, + { + "epoch": 0.11496, + "grad_norm": 0.9609375, + "learning_rate": 8.921935483870969e-05, + "loss": 0.1836, + "step": 7185 + }, + { + "epoch": 0.114976, + "grad_norm": 1.015625, + "learning_rate": 8.921774193548387e-05, + "loss": 0.1293, + "step": 7186 + }, + { + "epoch": 0.114992, + "grad_norm": 0.9375, + "learning_rate": 8.921612903225807e-05, + "loss": 0.1275, + "step": 7187 + }, + { + "epoch": 0.115008, + "grad_norm": 0.7421875, + "learning_rate": 8.921451612903227e-05, + "loss": 0.1899, + "step": 7188 + }, + { + "epoch": 0.115024, + "grad_norm": 0.6875, + "learning_rate": 8.921290322580646e-05, + "loss": 0.171, + "step": 7189 + }, + { + "epoch": 0.11504, + "grad_norm": 1.125, + "learning_rate": 8.921129032258066e-05, + "loss": 0.1834, + "step": 7190 + }, + { + "epoch": 0.115056, + "grad_norm": 1.046875, + "learning_rate": 8.920967741935484e-05, + "loss": 0.1474, + "step": 7191 + }, + { + "epoch": 0.115072, + "grad_norm": 1.359375, + "learning_rate": 8.920806451612904e-05, + "loss": 0.1745, + "step": 7192 + }, + { + "epoch": 0.115088, + "grad_norm": 1.53125, + "learning_rate": 8.920645161290323e-05, + "loss": 0.266, + "step": 7193 + }, + { + "epoch": 0.115104, + "grad_norm": 0.984375, + "learning_rate": 8.920483870967741e-05, + "loss": 0.1739, + "step": 7194 + }, + { + "epoch": 0.11512, + "grad_norm": 1.046875, + "learning_rate": 8.920322580645161e-05, + "loss": 0.1762, + "step": 7195 + }, + { + "epoch": 0.115136, + "grad_norm": 1.078125, + "learning_rate": 8.92016129032258e-05, + "loss": 0.1928, + "step": 7196 + }, + { + "epoch": 0.115152, + "grad_norm": 0.58984375, + "learning_rate": 8.92e-05, + "loss": 0.1443, + "step": 7197 + }, + { + "epoch": 0.115168, + "grad_norm": 1.3359375, + "learning_rate": 8.91983870967742e-05, + "loss": 0.1638, + "step": 7198 + }, + { + "epoch": 0.115184, + "grad_norm": 0.7890625, + "learning_rate": 8.91967741935484e-05, + "loss": 0.1839, + "step": 7199 + }, + { + "epoch": 0.1152, + "grad_norm": 0.984375, + "learning_rate": 8.919516129032259e-05, + "loss": 0.1737, + "step": 7200 + }, + { + "epoch": 0.115216, + "grad_norm": 0.70703125, + "learning_rate": 8.919354838709678e-05, + "loss": 0.1597, + "step": 7201 + }, + { + "epoch": 0.115232, + "grad_norm": 0.546875, + "learning_rate": 8.919193548387097e-05, + "loss": 0.1816, + "step": 7202 + }, + { + "epoch": 0.115248, + "grad_norm": 0.90234375, + "learning_rate": 8.919032258064517e-05, + "loss": 0.1901, + "step": 7203 + }, + { + "epoch": 0.115264, + "grad_norm": 0.5078125, + "learning_rate": 8.918870967741936e-05, + "loss": 0.1459, + "step": 7204 + }, + { + "epoch": 0.11528, + "grad_norm": 0.6015625, + "learning_rate": 8.918709677419356e-05, + "loss": 0.2012, + "step": 7205 + }, + { + "epoch": 0.115296, + "grad_norm": 0.7421875, + "learning_rate": 8.918548387096774e-05, + "loss": 0.2197, + "step": 7206 + }, + { + "epoch": 0.115312, + "grad_norm": 0.8515625, + "learning_rate": 8.918387096774194e-05, + "loss": 0.1871, + "step": 7207 + }, + { + "epoch": 0.115328, + "grad_norm": 0.83984375, + "learning_rate": 8.918225806451613e-05, + "loss": 0.1443, + "step": 7208 + }, + { + "epoch": 0.115344, + "grad_norm": 0.8359375, + "learning_rate": 8.918064516129033e-05, + "loss": 0.176, + "step": 7209 + }, + { + "epoch": 0.11536, + "grad_norm": 0.74609375, + "learning_rate": 8.917903225806453e-05, + "loss": 0.1585, + "step": 7210 + }, + { + "epoch": 0.115376, + "grad_norm": 1.0390625, + "learning_rate": 8.917741935483871e-05, + "loss": 0.176, + "step": 7211 + }, + { + "epoch": 0.115392, + "grad_norm": 0.6875, + "learning_rate": 8.917580645161291e-05, + "loss": 0.1525, + "step": 7212 + }, + { + "epoch": 0.115408, + "grad_norm": 1.0625, + "learning_rate": 8.91741935483871e-05, + "loss": 0.1807, + "step": 7213 + }, + { + "epoch": 0.115424, + "grad_norm": 0.66015625, + "learning_rate": 8.91725806451613e-05, + "loss": 0.125, + "step": 7214 + }, + { + "epoch": 0.11544, + "grad_norm": 0.9140625, + "learning_rate": 8.917096774193548e-05, + "loss": 0.1807, + "step": 7215 + }, + { + "epoch": 0.115456, + "grad_norm": 0.5234375, + "learning_rate": 8.916935483870968e-05, + "loss": 0.1681, + "step": 7216 + }, + { + "epoch": 0.115472, + "grad_norm": 0.703125, + "learning_rate": 8.916774193548387e-05, + "loss": 0.1571, + "step": 7217 + }, + { + "epoch": 0.115488, + "grad_norm": 0.703125, + "learning_rate": 8.916612903225807e-05, + "loss": 0.1418, + "step": 7218 + }, + { + "epoch": 0.115504, + "grad_norm": 0.72265625, + "learning_rate": 8.916451612903226e-05, + "loss": 0.162, + "step": 7219 + }, + { + "epoch": 0.11552, + "grad_norm": 0.68359375, + "learning_rate": 8.916290322580646e-05, + "loss": 0.2171, + "step": 7220 + }, + { + "epoch": 0.115536, + "grad_norm": 0.93359375, + "learning_rate": 8.916129032258064e-05, + "loss": 0.1708, + "step": 7221 + }, + { + "epoch": 0.115552, + "grad_norm": 1.1953125, + "learning_rate": 8.915967741935484e-05, + "loss": 0.2286, + "step": 7222 + }, + { + "epoch": 0.115568, + "grad_norm": 1.0859375, + "learning_rate": 8.915806451612904e-05, + "loss": 0.2092, + "step": 7223 + }, + { + "epoch": 0.115584, + "grad_norm": 0.859375, + "learning_rate": 8.915645161290323e-05, + "loss": 0.2266, + "step": 7224 + }, + { + "epoch": 0.1156, + "grad_norm": 1.0625, + "learning_rate": 8.915483870967743e-05, + "loss": 0.1897, + "step": 7225 + }, + { + "epoch": 0.115616, + "grad_norm": 0.6328125, + "learning_rate": 8.915322580645161e-05, + "loss": 0.1676, + "step": 7226 + }, + { + "epoch": 0.115632, + "grad_norm": 0.78515625, + "learning_rate": 8.915161290322581e-05, + "loss": 0.1759, + "step": 7227 + }, + { + "epoch": 0.115648, + "grad_norm": 0.78125, + "learning_rate": 8.915e-05, + "loss": 0.1641, + "step": 7228 + }, + { + "epoch": 0.115664, + "grad_norm": 0.7265625, + "learning_rate": 8.91483870967742e-05, + "loss": 0.1643, + "step": 7229 + }, + { + "epoch": 0.11568, + "grad_norm": 0.62109375, + "learning_rate": 8.914677419354838e-05, + "loss": 0.1467, + "step": 7230 + }, + { + "epoch": 0.115696, + "grad_norm": 1.2265625, + "learning_rate": 8.914516129032258e-05, + "loss": 0.1225, + "step": 7231 + }, + { + "epoch": 0.115712, + "grad_norm": 0.60546875, + "learning_rate": 8.914354838709677e-05, + "loss": 0.1311, + "step": 7232 + }, + { + "epoch": 0.115728, + "grad_norm": 1.328125, + "learning_rate": 8.914193548387097e-05, + "loss": 0.2322, + "step": 7233 + }, + { + "epoch": 0.115744, + "grad_norm": 1.0078125, + "learning_rate": 8.914032258064517e-05, + "loss": 0.1719, + "step": 7234 + }, + { + "epoch": 0.11576, + "grad_norm": 0.81640625, + "learning_rate": 8.913870967741937e-05, + "loss": 0.1667, + "step": 7235 + }, + { + "epoch": 0.115776, + "grad_norm": 0.64453125, + "learning_rate": 8.913709677419355e-05, + "loss": 0.157, + "step": 7236 + }, + { + "epoch": 0.115792, + "grad_norm": 1.5, + "learning_rate": 8.913548387096775e-05, + "loss": 0.1545, + "step": 7237 + }, + { + "epoch": 0.115808, + "grad_norm": 0.7890625, + "learning_rate": 8.913387096774194e-05, + "loss": 0.1772, + "step": 7238 + }, + { + "epoch": 0.115824, + "grad_norm": 0.703125, + "learning_rate": 8.913225806451614e-05, + "loss": 0.1487, + "step": 7239 + }, + { + "epoch": 0.11584, + "grad_norm": 1.7578125, + "learning_rate": 8.913064516129033e-05, + "loss": 0.1654, + "step": 7240 + }, + { + "epoch": 0.115856, + "grad_norm": 1.6328125, + "learning_rate": 8.912903225806451e-05, + "loss": 0.2039, + "step": 7241 + }, + { + "epoch": 0.115872, + "grad_norm": 1.5625, + "learning_rate": 8.912741935483871e-05, + "loss": 0.2055, + "step": 7242 + }, + { + "epoch": 0.115888, + "grad_norm": 0.76171875, + "learning_rate": 8.91258064516129e-05, + "loss": 0.1798, + "step": 7243 + }, + { + "epoch": 0.115904, + "grad_norm": 0.55078125, + "learning_rate": 8.91241935483871e-05, + "loss": 0.1288, + "step": 7244 + }, + { + "epoch": 0.11592, + "grad_norm": 1.296875, + "learning_rate": 8.91225806451613e-05, + "loss": 0.2343, + "step": 7245 + }, + { + "epoch": 0.115936, + "grad_norm": 1.1484375, + "learning_rate": 8.91209677419355e-05, + "loss": 0.2206, + "step": 7246 + }, + { + "epoch": 0.115952, + "grad_norm": 0.95703125, + "learning_rate": 8.911935483870968e-05, + "loss": 0.1958, + "step": 7247 + }, + { + "epoch": 0.115968, + "grad_norm": 0.703125, + "learning_rate": 8.911774193548388e-05, + "loss": 0.1371, + "step": 7248 + }, + { + "epoch": 0.115984, + "grad_norm": 0.87109375, + "learning_rate": 8.911612903225807e-05, + "loss": 0.1836, + "step": 7249 + }, + { + "epoch": 0.116, + "grad_norm": 1.1796875, + "learning_rate": 8.911451612903227e-05, + "loss": 0.1534, + "step": 7250 + }, + { + "epoch": 0.116016, + "grad_norm": 0.82421875, + "learning_rate": 8.911290322580645e-05, + "loss": 0.1571, + "step": 7251 + }, + { + "epoch": 0.116032, + "grad_norm": 0.90625, + "learning_rate": 8.911129032258065e-05, + "loss": 0.215, + "step": 7252 + }, + { + "epoch": 0.116048, + "grad_norm": 0.8359375, + "learning_rate": 8.910967741935484e-05, + "loss": 0.171, + "step": 7253 + }, + { + "epoch": 0.116064, + "grad_norm": 0.515625, + "learning_rate": 8.910806451612904e-05, + "loss": 0.1395, + "step": 7254 + }, + { + "epoch": 0.11608, + "grad_norm": 0.84765625, + "learning_rate": 8.910645161290323e-05, + "loss": 0.2327, + "step": 7255 + }, + { + "epoch": 0.116096, + "grad_norm": 0.671875, + "learning_rate": 8.910483870967741e-05, + "loss": 0.1708, + "step": 7256 + }, + { + "epoch": 0.116112, + "grad_norm": 0.81640625, + "learning_rate": 8.910322580645161e-05, + "loss": 0.179, + "step": 7257 + }, + { + "epoch": 0.116128, + "grad_norm": 0.6484375, + "learning_rate": 8.910161290322581e-05, + "loss": 0.1799, + "step": 7258 + }, + { + "epoch": 0.116144, + "grad_norm": 1.7265625, + "learning_rate": 8.910000000000001e-05, + "loss": 0.203, + "step": 7259 + }, + { + "epoch": 0.11616, + "grad_norm": 0.74609375, + "learning_rate": 8.90983870967742e-05, + "loss": 0.1881, + "step": 7260 + }, + { + "epoch": 0.116176, + "grad_norm": 0.703125, + "learning_rate": 8.90967741935484e-05, + "loss": 0.157, + "step": 7261 + }, + { + "epoch": 0.116192, + "grad_norm": 0.58984375, + "learning_rate": 8.909516129032258e-05, + "loss": 0.1887, + "step": 7262 + }, + { + "epoch": 0.116208, + "grad_norm": 0.53515625, + "learning_rate": 8.909354838709678e-05, + "loss": 0.1359, + "step": 7263 + }, + { + "epoch": 0.116224, + "grad_norm": 0.6328125, + "learning_rate": 8.909193548387097e-05, + "loss": 0.1636, + "step": 7264 + }, + { + "epoch": 0.11624, + "grad_norm": 0.72265625, + "learning_rate": 8.909032258064517e-05, + "loss": 0.1254, + "step": 7265 + }, + { + "epoch": 0.116256, + "grad_norm": 0.6796875, + "learning_rate": 8.908870967741935e-05, + "loss": 0.1662, + "step": 7266 + }, + { + "epoch": 0.116272, + "grad_norm": 0.65234375, + "learning_rate": 8.908709677419355e-05, + "loss": 0.1863, + "step": 7267 + }, + { + "epoch": 0.116288, + "grad_norm": 0.7890625, + "learning_rate": 8.908548387096774e-05, + "loss": 0.1972, + "step": 7268 + }, + { + "epoch": 0.116304, + "grad_norm": 0.6796875, + "learning_rate": 8.908387096774194e-05, + "loss": 0.1734, + "step": 7269 + }, + { + "epoch": 0.11632, + "grad_norm": 0.7578125, + "learning_rate": 8.908225806451614e-05, + "loss": 0.1693, + "step": 7270 + }, + { + "epoch": 0.116336, + "grad_norm": 1.21875, + "learning_rate": 8.908064516129033e-05, + "loss": 0.1962, + "step": 7271 + }, + { + "epoch": 0.116352, + "grad_norm": 0.72265625, + "learning_rate": 8.907903225806452e-05, + "loss": 0.1797, + "step": 7272 + }, + { + "epoch": 0.116368, + "grad_norm": 1.6015625, + "learning_rate": 8.907741935483871e-05, + "loss": 0.1945, + "step": 7273 + }, + { + "epoch": 0.116384, + "grad_norm": 1.0546875, + "learning_rate": 8.907580645161291e-05, + "loss": 0.2011, + "step": 7274 + }, + { + "epoch": 0.1164, + "grad_norm": 0.92578125, + "learning_rate": 8.90741935483871e-05, + "loss": 0.1682, + "step": 7275 + }, + { + "epoch": 0.116416, + "grad_norm": 1.03125, + "learning_rate": 8.90725806451613e-05, + "loss": 0.1529, + "step": 7276 + }, + { + "epoch": 0.116432, + "grad_norm": 0.98046875, + "learning_rate": 8.907096774193548e-05, + "loss": 0.18, + "step": 7277 + }, + { + "epoch": 0.116448, + "grad_norm": 0.73828125, + "learning_rate": 8.906935483870968e-05, + "loss": 0.1741, + "step": 7278 + }, + { + "epoch": 0.116464, + "grad_norm": 0.57421875, + "learning_rate": 8.906774193548387e-05, + "loss": 0.1565, + "step": 7279 + }, + { + "epoch": 0.11648, + "grad_norm": 0.76953125, + "learning_rate": 8.906612903225807e-05, + "loss": 0.1782, + "step": 7280 + }, + { + "epoch": 0.116496, + "grad_norm": 0.66796875, + "learning_rate": 8.906451612903227e-05, + "loss": 0.1787, + "step": 7281 + }, + { + "epoch": 0.116512, + "grad_norm": 1.1796875, + "learning_rate": 8.906290322580645e-05, + "loss": 0.1706, + "step": 7282 + }, + { + "epoch": 0.116528, + "grad_norm": 0.72265625, + "learning_rate": 8.906129032258065e-05, + "loss": 0.1781, + "step": 7283 + }, + { + "epoch": 0.116544, + "grad_norm": 1.046875, + "learning_rate": 8.905967741935485e-05, + "loss": 0.191, + "step": 7284 + }, + { + "epoch": 0.11656, + "grad_norm": 1.09375, + "learning_rate": 8.905806451612904e-05, + "loss": 0.2171, + "step": 7285 + }, + { + "epoch": 0.116576, + "grad_norm": 0.515625, + "learning_rate": 8.905645161290324e-05, + "loss": 0.1733, + "step": 7286 + }, + { + "epoch": 0.116592, + "grad_norm": 0.71875, + "learning_rate": 8.905483870967742e-05, + "loss": 0.1802, + "step": 7287 + }, + { + "epoch": 0.116608, + "grad_norm": 0.71484375, + "learning_rate": 8.905322580645161e-05, + "loss": 0.1974, + "step": 7288 + }, + { + "epoch": 0.116624, + "grad_norm": 0.7421875, + "learning_rate": 8.905161290322581e-05, + "loss": 0.2005, + "step": 7289 + }, + { + "epoch": 0.11664, + "grad_norm": 0.91015625, + "learning_rate": 8.905e-05, + "loss": 0.1945, + "step": 7290 + }, + { + "epoch": 0.116656, + "grad_norm": 0.69140625, + "learning_rate": 8.90483870967742e-05, + "loss": 0.1491, + "step": 7291 + }, + { + "epoch": 0.116672, + "grad_norm": 0.8046875, + "learning_rate": 8.904677419354838e-05, + "loss": 0.2105, + "step": 7292 + }, + { + "epoch": 0.116688, + "grad_norm": 0.73828125, + "learning_rate": 8.904516129032258e-05, + "loss": 0.1995, + "step": 7293 + }, + { + "epoch": 0.116704, + "grad_norm": 1.109375, + "learning_rate": 8.904354838709678e-05, + "loss": 0.1918, + "step": 7294 + }, + { + "epoch": 0.11672, + "grad_norm": 0.71484375, + "learning_rate": 8.904193548387098e-05, + "loss": 0.1284, + "step": 7295 + }, + { + "epoch": 0.116736, + "grad_norm": 1.0, + "learning_rate": 8.904032258064517e-05, + "loss": 0.1688, + "step": 7296 + }, + { + "epoch": 0.116752, + "grad_norm": 0.6796875, + "learning_rate": 8.903870967741937e-05, + "loss": 0.1563, + "step": 7297 + }, + { + "epoch": 0.116768, + "grad_norm": 0.91015625, + "learning_rate": 8.903709677419355e-05, + "loss": 0.1892, + "step": 7298 + }, + { + "epoch": 0.116784, + "grad_norm": 0.9140625, + "learning_rate": 8.903548387096775e-05, + "loss": 0.1751, + "step": 7299 + }, + { + "epoch": 0.1168, + "grad_norm": 0.80859375, + "learning_rate": 8.903387096774194e-05, + "loss": 0.19, + "step": 7300 + }, + { + "epoch": 0.116816, + "grad_norm": 0.65625, + "learning_rate": 8.903225806451614e-05, + "loss": 0.1805, + "step": 7301 + }, + { + "epoch": 0.116832, + "grad_norm": 0.94921875, + "learning_rate": 8.903064516129032e-05, + "loss": 0.2033, + "step": 7302 + }, + { + "epoch": 0.116848, + "grad_norm": 0.7578125, + "learning_rate": 8.902903225806451e-05, + "loss": 0.1527, + "step": 7303 + }, + { + "epoch": 0.116864, + "grad_norm": 0.75, + "learning_rate": 8.902741935483871e-05, + "loss": 0.1834, + "step": 7304 + }, + { + "epoch": 0.11688, + "grad_norm": 0.86328125, + "learning_rate": 8.902580645161291e-05, + "loss": 0.19, + "step": 7305 + }, + { + "epoch": 0.116896, + "grad_norm": 0.98046875, + "learning_rate": 8.902419354838711e-05, + "loss": 0.2019, + "step": 7306 + }, + { + "epoch": 0.116912, + "grad_norm": 0.703125, + "learning_rate": 8.90225806451613e-05, + "loss": 0.1994, + "step": 7307 + }, + { + "epoch": 0.116928, + "grad_norm": 1.0234375, + "learning_rate": 8.90209677419355e-05, + "loss": 0.1434, + "step": 7308 + }, + { + "epoch": 0.116944, + "grad_norm": 0.91796875, + "learning_rate": 8.901935483870968e-05, + "loss": 0.1979, + "step": 7309 + }, + { + "epoch": 0.11696, + "grad_norm": 0.77734375, + "learning_rate": 8.901774193548388e-05, + "loss": 0.1974, + "step": 7310 + }, + { + "epoch": 0.116976, + "grad_norm": 0.94140625, + "learning_rate": 8.901612903225807e-05, + "loss": 0.1728, + "step": 7311 + }, + { + "epoch": 0.116992, + "grad_norm": 0.6953125, + "learning_rate": 8.901451612903227e-05, + "loss": 0.1658, + "step": 7312 + }, + { + "epoch": 0.117008, + "grad_norm": 0.6484375, + "learning_rate": 8.901290322580645e-05, + "loss": 0.1526, + "step": 7313 + }, + { + "epoch": 0.117024, + "grad_norm": 0.6484375, + "learning_rate": 8.901129032258065e-05, + "loss": 0.1558, + "step": 7314 + }, + { + "epoch": 0.11704, + "grad_norm": 0.75390625, + "learning_rate": 8.900967741935484e-05, + "loss": 0.1642, + "step": 7315 + }, + { + "epoch": 0.117056, + "grad_norm": 1.0703125, + "learning_rate": 8.900806451612904e-05, + "loss": 0.1751, + "step": 7316 + }, + { + "epoch": 0.117072, + "grad_norm": 1.0625, + "learning_rate": 8.900645161290322e-05, + "loss": 0.2094, + "step": 7317 + }, + { + "epoch": 0.117088, + "grad_norm": 0.93359375, + "learning_rate": 8.900483870967742e-05, + "loss": 0.1736, + "step": 7318 + }, + { + "epoch": 0.117104, + "grad_norm": 0.93359375, + "learning_rate": 8.900322580645162e-05, + "loss": 0.1971, + "step": 7319 + }, + { + "epoch": 0.11712, + "grad_norm": 1.671875, + "learning_rate": 8.900161290322581e-05, + "loss": 0.1887, + "step": 7320 + }, + { + "epoch": 0.117136, + "grad_norm": 0.98828125, + "learning_rate": 8.900000000000001e-05, + "loss": 0.2097, + "step": 7321 + }, + { + "epoch": 0.117152, + "grad_norm": 0.9296875, + "learning_rate": 8.89983870967742e-05, + "loss": 0.1898, + "step": 7322 + }, + { + "epoch": 0.117168, + "grad_norm": 1.015625, + "learning_rate": 8.89967741935484e-05, + "loss": 0.1781, + "step": 7323 + }, + { + "epoch": 0.117184, + "grad_norm": 0.6796875, + "learning_rate": 8.899516129032258e-05, + "loss": 0.1691, + "step": 7324 + }, + { + "epoch": 0.1172, + "grad_norm": 0.6171875, + "learning_rate": 8.899354838709678e-05, + "loss": 0.1614, + "step": 7325 + }, + { + "epoch": 0.117216, + "grad_norm": 0.5625, + "learning_rate": 8.899193548387097e-05, + "loss": 0.1944, + "step": 7326 + }, + { + "epoch": 0.117232, + "grad_norm": 0.62890625, + "learning_rate": 8.899032258064517e-05, + "loss": 0.2247, + "step": 7327 + }, + { + "epoch": 0.117248, + "grad_norm": 1.0390625, + "learning_rate": 8.898870967741935e-05, + "loss": 0.1861, + "step": 7328 + }, + { + "epoch": 0.117264, + "grad_norm": 0.9609375, + "learning_rate": 8.898709677419355e-05, + "loss": 0.1799, + "step": 7329 + }, + { + "epoch": 0.11728, + "grad_norm": 0.6875, + "learning_rate": 8.898548387096775e-05, + "loss": 0.1608, + "step": 7330 + }, + { + "epoch": 0.117296, + "grad_norm": 0.66015625, + "learning_rate": 8.898387096774195e-05, + "loss": 0.163, + "step": 7331 + }, + { + "epoch": 0.117312, + "grad_norm": 0.671875, + "learning_rate": 8.898225806451614e-05, + "loss": 0.1881, + "step": 7332 + }, + { + "epoch": 0.117328, + "grad_norm": 0.74609375, + "learning_rate": 8.898064516129032e-05, + "loss": 0.1926, + "step": 7333 + }, + { + "epoch": 0.117344, + "grad_norm": 0.671875, + "learning_rate": 8.897903225806452e-05, + "loss": 0.1547, + "step": 7334 + }, + { + "epoch": 0.11736, + "grad_norm": 1.1015625, + "learning_rate": 8.897741935483871e-05, + "loss": 0.1763, + "step": 7335 + }, + { + "epoch": 0.117376, + "grad_norm": 0.94921875, + "learning_rate": 8.897580645161291e-05, + "loss": 0.2081, + "step": 7336 + }, + { + "epoch": 0.117392, + "grad_norm": 0.69921875, + "learning_rate": 8.89741935483871e-05, + "loss": 0.1804, + "step": 7337 + }, + { + "epoch": 0.117408, + "grad_norm": 0.87109375, + "learning_rate": 8.89725806451613e-05, + "loss": 0.1755, + "step": 7338 + }, + { + "epoch": 0.117424, + "grad_norm": 0.8046875, + "learning_rate": 8.897096774193548e-05, + "loss": 0.1986, + "step": 7339 + }, + { + "epoch": 0.11744, + "grad_norm": 0.91015625, + "learning_rate": 8.896935483870968e-05, + "loss": 0.1512, + "step": 7340 + }, + { + "epoch": 0.117456, + "grad_norm": 0.6953125, + "learning_rate": 8.896774193548388e-05, + "loss": 0.1717, + "step": 7341 + }, + { + "epoch": 0.117472, + "grad_norm": 0.6640625, + "learning_rate": 8.896612903225808e-05, + "loss": 0.1645, + "step": 7342 + }, + { + "epoch": 0.117488, + "grad_norm": 0.62109375, + "learning_rate": 8.896451612903226e-05, + "loss": 0.1859, + "step": 7343 + }, + { + "epoch": 0.117504, + "grad_norm": 0.8515625, + "learning_rate": 8.896290322580646e-05, + "loss": 0.1781, + "step": 7344 + }, + { + "epoch": 0.11752, + "grad_norm": 0.8359375, + "learning_rate": 8.896129032258065e-05, + "loss": 0.1543, + "step": 7345 + }, + { + "epoch": 0.117536, + "grad_norm": 0.57421875, + "learning_rate": 8.895967741935485e-05, + "loss": 0.2112, + "step": 7346 + }, + { + "epoch": 0.117552, + "grad_norm": 1.0859375, + "learning_rate": 8.895806451612904e-05, + "loss": 0.1759, + "step": 7347 + }, + { + "epoch": 0.117568, + "grad_norm": 0.74609375, + "learning_rate": 8.895645161290324e-05, + "loss": 0.17, + "step": 7348 + }, + { + "epoch": 0.117584, + "grad_norm": 0.796875, + "learning_rate": 8.895483870967742e-05, + "loss": 0.2091, + "step": 7349 + }, + { + "epoch": 0.1176, + "grad_norm": 0.92578125, + "learning_rate": 8.895322580645161e-05, + "loss": 0.2365, + "step": 7350 + }, + { + "epoch": 0.117616, + "grad_norm": 0.8203125, + "learning_rate": 8.895161290322581e-05, + "loss": 0.2081, + "step": 7351 + }, + { + "epoch": 0.117632, + "grad_norm": 0.609375, + "learning_rate": 8.895e-05, + "loss": 0.1375, + "step": 7352 + }, + { + "epoch": 0.117648, + "grad_norm": 0.6015625, + "learning_rate": 8.89483870967742e-05, + "loss": 0.1867, + "step": 7353 + }, + { + "epoch": 0.117664, + "grad_norm": 0.94140625, + "learning_rate": 8.894677419354839e-05, + "loss": 0.1968, + "step": 7354 + }, + { + "epoch": 0.11768, + "grad_norm": 0.94140625, + "learning_rate": 8.894516129032259e-05, + "loss": 0.1574, + "step": 7355 + }, + { + "epoch": 0.117696, + "grad_norm": 1.0859375, + "learning_rate": 8.894354838709678e-05, + "loss": 0.2203, + "step": 7356 + }, + { + "epoch": 0.117712, + "grad_norm": 0.83203125, + "learning_rate": 8.894193548387098e-05, + "loss": 0.1342, + "step": 7357 + }, + { + "epoch": 0.117728, + "grad_norm": 0.88671875, + "learning_rate": 8.894032258064516e-05, + "loss": 0.1475, + "step": 7358 + }, + { + "epoch": 0.117744, + "grad_norm": 0.7734375, + "learning_rate": 8.893870967741936e-05, + "loss": 0.1902, + "step": 7359 + }, + { + "epoch": 0.11776, + "grad_norm": 0.73046875, + "learning_rate": 8.893709677419355e-05, + "loss": 0.1712, + "step": 7360 + }, + { + "epoch": 0.117776, + "grad_norm": 0.609375, + "learning_rate": 8.893548387096775e-05, + "loss": 0.154, + "step": 7361 + }, + { + "epoch": 0.117792, + "grad_norm": 0.8671875, + "learning_rate": 8.893387096774194e-05, + "loss": 0.1885, + "step": 7362 + }, + { + "epoch": 0.117808, + "grad_norm": 0.6171875, + "learning_rate": 8.893225806451614e-05, + "loss": 0.1908, + "step": 7363 + }, + { + "epoch": 0.117824, + "grad_norm": 0.78125, + "learning_rate": 8.893064516129032e-05, + "loss": 0.2123, + "step": 7364 + }, + { + "epoch": 0.11784, + "grad_norm": 0.6953125, + "learning_rate": 8.892903225806452e-05, + "loss": 0.1823, + "step": 7365 + }, + { + "epoch": 0.117856, + "grad_norm": 1.0, + "learning_rate": 8.892741935483872e-05, + "loss": 0.1282, + "step": 7366 + }, + { + "epoch": 0.117872, + "grad_norm": 0.54296875, + "learning_rate": 8.892580645161291e-05, + "loss": 0.1667, + "step": 7367 + }, + { + "epoch": 0.117888, + "grad_norm": 0.99609375, + "learning_rate": 8.89241935483871e-05, + "loss": 0.1594, + "step": 7368 + }, + { + "epoch": 0.117904, + "grad_norm": 0.80859375, + "learning_rate": 8.892258064516129e-05, + "loss": 0.2157, + "step": 7369 + }, + { + "epoch": 0.11792, + "grad_norm": 0.68359375, + "learning_rate": 8.892096774193549e-05, + "loss": 0.1288, + "step": 7370 + }, + { + "epoch": 0.117936, + "grad_norm": 1.0234375, + "learning_rate": 8.891935483870968e-05, + "loss": 0.1728, + "step": 7371 + }, + { + "epoch": 0.117952, + "grad_norm": 0.79296875, + "learning_rate": 8.891774193548388e-05, + "loss": 0.1643, + "step": 7372 + }, + { + "epoch": 0.117968, + "grad_norm": 0.70703125, + "learning_rate": 8.891612903225806e-05, + "loss": 0.195, + "step": 7373 + }, + { + "epoch": 0.117984, + "grad_norm": 0.63671875, + "learning_rate": 8.891451612903226e-05, + "loss": 0.158, + "step": 7374 + }, + { + "epoch": 0.118, + "grad_norm": 0.6328125, + "learning_rate": 8.891290322580645e-05, + "loss": 0.1718, + "step": 7375 + }, + { + "epoch": 0.118016, + "grad_norm": 0.984375, + "learning_rate": 8.891129032258065e-05, + "loss": 0.1827, + "step": 7376 + }, + { + "epoch": 0.118032, + "grad_norm": 0.703125, + "learning_rate": 8.890967741935484e-05, + "loss": 0.1814, + "step": 7377 + }, + { + "epoch": 0.118048, + "grad_norm": 0.6328125, + "learning_rate": 8.890806451612903e-05, + "loss": 0.175, + "step": 7378 + }, + { + "epoch": 0.118064, + "grad_norm": 0.83984375, + "learning_rate": 8.890645161290323e-05, + "loss": 0.1862, + "step": 7379 + }, + { + "epoch": 0.11808, + "grad_norm": 1.671875, + "learning_rate": 8.890483870967742e-05, + "loss": 0.2109, + "step": 7380 + }, + { + "epoch": 0.118096, + "grad_norm": 0.63671875, + "learning_rate": 8.890322580645162e-05, + "loss": 0.1465, + "step": 7381 + }, + { + "epoch": 0.118112, + "grad_norm": 0.6953125, + "learning_rate": 8.89016129032258e-05, + "loss": 0.2075, + "step": 7382 + }, + { + "epoch": 0.118128, + "grad_norm": 1.0234375, + "learning_rate": 8.89e-05, + "loss": 0.1828, + "step": 7383 + }, + { + "epoch": 0.118144, + "grad_norm": 0.734375, + "learning_rate": 8.889838709677419e-05, + "loss": 0.2135, + "step": 7384 + }, + { + "epoch": 0.11816, + "grad_norm": 0.78515625, + "learning_rate": 8.889677419354839e-05, + "loss": 0.2198, + "step": 7385 + }, + { + "epoch": 0.118176, + "grad_norm": 0.82421875, + "learning_rate": 8.889516129032258e-05, + "loss": 0.177, + "step": 7386 + }, + { + "epoch": 0.118192, + "grad_norm": 1.046875, + "learning_rate": 8.889354838709678e-05, + "loss": 0.1487, + "step": 7387 + }, + { + "epoch": 0.118208, + "grad_norm": 0.62109375, + "learning_rate": 8.889193548387096e-05, + "loss": 0.1546, + "step": 7388 + }, + { + "epoch": 0.118224, + "grad_norm": 0.578125, + "learning_rate": 8.889032258064516e-05, + "loss": 0.1709, + "step": 7389 + }, + { + "epoch": 0.11824, + "grad_norm": 0.58203125, + "learning_rate": 8.888870967741936e-05, + "loss": 0.1549, + "step": 7390 + }, + { + "epoch": 0.118256, + "grad_norm": 0.69921875, + "learning_rate": 8.888709677419356e-05, + "loss": 0.1733, + "step": 7391 + }, + { + "epoch": 0.118272, + "grad_norm": 0.69140625, + "learning_rate": 8.888548387096775e-05, + "loss": 0.188, + "step": 7392 + }, + { + "epoch": 0.118288, + "grad_norm": 1.0625, + "learning_rate": 8.888387096774195e-05, + "loss": 0.2022, + "step": 7393 + }, + { + "epoch": 0.118304, + "grad_norm": 0.62890625, + "learning_rate": 8.888225806451613e-05, + "loss": 0.1987, + "step": 7394 + }, + { + "epoch": 0.11832, + "grad_norm": 0.474609375, + "learning_rate": 8.888064516129033e-05, + "loss": 0.1425, + "step": 7395 + }, + { + "epoch": 0.118336, + "grad_norm": 1.0078125, + "learning_rate": 8.887903225806452e-05, + "loss": 0.1623, + "step": 7396 + }, + { + "epoch": 0.118352, + "grad_norm": 0.62109375, + "learning_rate": 8.88774193548387e-05, + "loss": 0.1565, + "step": 7397 + }, + { + "epoch": 0.118368, + "grad_norm": 0.56640625, + "learning_rate": 8.88758064516129e-05, + "loss": 0.1614, + "step": 7398 + }, + { + "epoch": 0.118384, + "grad_norm": 0.84765625, + "learning_rate": 8.887419354838709e-05, + "loss": 0.1836, + "step": 7399 + }, + { + "epoch": 0.1184, + "grad_norm": 0.78515625, + "learning_rate": 8.887258064516129e-05, + "loss": 0.1615, + "step": 7400 + }, + { + "epoch": 0.118416, + "grad_norm": 1.234375, + "learning_rate": 8.887096774193549e-05, + "loss": 0.1656, + "step": 7401 + }, + { + "epoch": 0.118432, + "grad_norm": 1.765625, + "learning_rate": 8.886935483870969e-05, + "loss": 0.1902, + "step": 7402 + }, + { + "epoch": 0.118448, + "grad_norm": 0.80078125, + "learning_rate": 8.886774193548388e-05, + "loss": 0.1788, + "step": 7403 + }, + { + "epoch": 0.118464, + "grad_norm": 0.734375, + "learning_rate": 8.886612903225808e-05, + "loss": 0.1973, + "step": 7404 + }, + { + "epoch": 0.11848, + "grad_norm": 0.765625, + "learning_rate": 8.886451612903226e-05, + "loss": 0.1948, + "step": 7405 + }, + { + "epoch": 0.118496, + "grad_norm": 0.61328125, + "learning_rate": 8.886290322580646e-05, + "loss": 0.1577, + "step": 7406 + }, + { + "epoch": 0.118512, + "grad_norm": 0.69140625, + "learning_rate": 8.886129032258065e-05, + "loss": 0.1876, + "step": 7407 + }, + { + "epoch": 0.118528, + "grad_norm": 0.88671875, + "learning_rate": 8.885967741935485e-05, + "loss": 0.2084, + "step": 7408 + }, + { + "epoch": 0.118544, + "grad_norm": 0.80078125, + "learning_rate": 8.885806451612903e-05, + "loss": 0.1569, + "step": 7409 + }, + { + "epoch": 0.11856, + "grad_norm": 0.9609375, + "learning_rate": 8.885645161290323e-05, + "loss": 0.2114, + "step": 7410 + }, + { + "epoch": 0.118576, + "grad_norm": 1.015625, + "learning_rate": 8.885483870967742e-05, + "loss": 0.2038, + "step": 7411 + }, + { + "epoch": 0.118592, + "grad_norm": 0.66015625, + "learning_rate": 8.88532258064516e-05, + "loss": 0.1846, + "step": 7412 + }, + { + "epoch": 0.118608, + "grad_norm": 0.65234375, + "learning_rate": 8.88516129032258e-05, + "loss": 0.1913, + "step": 7413 + }, + { + "epoch": 0.118624, + "grad_norm": 0.65234375, + "learning_rate": 8.885e-05, + "loss": 0.1794, + "step": 7414 + }, + { + "epoch": 0.11864, + "grad_norm": 0.76953125, + "learning_rate": 8.88483870967742e-05, + "loss": 0.1827, + "step": 7415 + }, + { + "epoch": 0.118656, + "grad_norm": 0.69921875, + "learning_rate": 8.884677419354839e-05, + "loss": 0.1528, + "step": 7416 + }, + { + "epoch": 0.118672, + "grad_norm": 1.2265625, + "learning_rate": 8.884516129032259e-05, + "loss": 0.2039, + "step": 7417 + }, + { + "epoch": 0.118688, + "grad_norm": 0.640625, + "learning_rate": 8.884354838709678e-05, + "loss": 0.1483, + "step": 7418 + }, + { + "epoch": 0.118704, + "grad_norm": 1.0625, + "learning_rate": 8.884193548387098e-05, + "loss": 0.2197, + "step": 7419 + }, + { + "epoch": 0.11872, + "grad_norm": 1.125, + "learning_rate": 8.884032258064516e-05, + "loss": 0.1987, + "step": 7420 + }, + { + "epoch": 0.118736, + "grad_norm": 0.5, + "learning_rate": 8.883870967741936e-05, + "loss": 0.1381, + "step": 7421 + }, + { + "epoch": 0.118752, + "grad_norm": 0.9921875, + "learning_rate": 8.883709677419355e-05, + "loss": 0.1569, + "step": 7422 + }, + { + "epoch": 0.118768, + "grad_norm": 0.75390625, + "learning_rate": 8.883548387096775e-05, + "loss": 0.1838, + "step": 7423 + }, + { + "epoch": 0.118784, + "grad_norm": 0.7265625, + "learning_rate": 8.883387096774193e-05, + "loss": 0.1864, + "step": 7424 + }, + { + "epoch": 0.1188, + "grad_norm": 0.85546875, + "learning_rate": 8.883225806451613e-05, + "loss": 0.1946, + "step": 7425 + }, + { + "epoch": 0.118816, + "grad_norm": 0.6875, + "learning_rate": 8.883064516129033e-05, + "loss": 0.2104, + "step": 7426 + }, + { + "epoch": 0.118832, + "grad_norm": 1.09375, + "learning_rate": 8.882903225806452e-05, + "loss": 0.1576, + "step": 7427 + }, + { + "epoch": 0.118848, + "grad_norm": 0.85546875, + "learning_rate": 8.882741935483872e-05, + "loss": 0.1585, + "step": 7428 + }, + { + "epoch": 0.118864, + "grad_norm": 0.6953125, + "learning_rate": 8.88258064516129e-05, + "loss": 0.2087, + "step": 7429 + }, + { + "epoch": 0.11888, + "grad_norm": 0.828125, + "learning_rate": 8.88241935483871e-05, + "loss": 0.165, + "step": 7430 + }, + { + "epoch": 0.118896, + "grad_norm": 0.58203125, + "learning_rate": 8.882258064516129e-05, + "loss": 0.1516, + "step": 7431 + }, + { + "epoch": 0.118912, + "grad_norm": 0.70703125, + "learning_rate": 8.882096774193549e-05, + "loss": 0.1939, + "step": 7432 + }, + { + "epoch": 0.118928, + "grad_norm": 0.7421875, + "learning_rate": 8.881935483870968e-05, + "loss": 0.2002, + "step": 7433 + }, + { + "epoch": 0.118944, + "grad_norm": 1.125, + "learning_rate": 8.881774193548388e-05, + "loss": 0.1627, + "step": 7434 + }, + { + "epoch": 0.11896, + "grad_norm": 0.90625, + "learning_rate": 8.881612903225806e-05, + "loss": 0.1972, + "step": 7435 + }, + { + "epoch": 0.118976, + "grad_norm": 0.78125, + "learning_rate": 8.881451612903226e-05, + "loss": 0.1877, + "step": 7436 + }, + { + "epoch": 0.118992, + "grad_norm": 0.5625, + "learning_rate": 8.881290322580646e-05, + "loss": 0.1634, + "step": 7437 + }, + { + "epoch": 0.119008, + "grad_norm": 0.6875, + "learning_rate": 8.881129032258065e-05, + "loss": 0.1784, + "step": 7438 + }, + { + "epoch": 0.119024, + "grad_norm": 0.91796875, + "learning_rate": 8.880967741935485e-05, + "loss": 0.2125, + "step": 7439 + }, + { + "epoch": 0.11904, + "grad_norm": 0.84375, + "learning_rate": 8.880806451612905e-05, + "loss": 0.1803, + "step": 7440 + }, + { + "epoch": 0.119056, + "grad_norm": 0.9609375, + "learning_rate": 8.880645161290323e-05, + "loss": 0.156, + "step": 7441 + }, + { + "epoch": 0.119072, + "grad_norm": 1.0546875, + "learning_rate": 8.880483870967742e-05, + "loss": 0.2115, + "step": 7442 + }, + { + "epoch": 0.119088, + "grad_norm": 1.2578125, + "learning_rate": 8.880322580645162e-05, + "loss": 0.178, + "step": 7443 + }, + { + "epoch": 0.119104, + "grad_norm": 2.375, + "learning_rate": 8.88016129032258e-05, + "loss": 0.2285, + "step": 7444 + }, + { + "epoch": 0.11912, + "grad_norm": 0.734375, + "learning_rate": 8.88e-05, + "loss": 0.1892, + "step": 7445 + }, + { + "epoch": 0.119136, + "grad_norm": 0.9921875, + "learning_rate": 8.879838709677419e-05, + "loss": 0.1802, + "step": 7446 + }, + { + "epoch": 0.119152, + "grad_norm": 0.890625, + "learning_rate": 8.879677419354839e-05, + "loss": 0.1997, + "step": 7447 + }, + { + "epoch": 0.119168, + "grad_norm": 0.7578125, + "learning_rate": 8.879516129032258e-05, + "loss": 0.2127, + "step": 7448 + }, + { + "epoch": 0.119184, + "grad_norm": 0.69140625, + "learning_rate": 8.879354838709678e-05, + "loss": 0.1857, + "step": 7449 + }, + { + "epoch": 0.1192, + "grad_norm": 0.64453125, + "learning_rate": 8.879193548387097e-05, + "loss": 0.1967, + "step": 7450 + }, + { + "epoch": 0.119216, + "grad_norm": 0.66015625, + "learning_rate": 8.879032258064517e-05, + "loss": 0.1524, + "step": 7451 + }, + { + "epoch": 0.119232, + "grad_norm": 0.83984375, + "learning_rate": 8.878870967741936e-05, + "loss": 0.1818, + "step": 7452 + }, + { + "epoch": 0.119248, + "grad_norm": 0.7578125, + "learning_rate": 8.878709677419356e-05, + "loss": 0.18, + "step": 7453 + }, + { + "epoch": 0.119264, + "grad_norm": 0.8125, + "learning_rate": 8.878548387096775e-05, + "loss": 0.1606, + "step": 7454 + }, + { + "epoch": 0.11928, + "grad_norm": 0.69140625, + "learning_rate": 8.878387096774195e-05, + "loss": 0.1889, + "step": 7455 + }, + { + "epoch": 0.119296, + "grad_norm": 0.859375, + "learning_rate": 8.878225806451613e-05, + "loss": 0.1917, + "step": 7456 + }, + { + "epoch": 0.119312, + "grad_norm": 0.53125, + "learning_rate": 8.878064516129033e-05, + "loss": 0.1641, + "step": 7457 + }, + { + "epoch": 0.119328, + "grad_norm": 0.80078125, + "learning_rate": 8.877903225806452e-05, + "loss": 0.1797, + "step": 7458 + }, + { + "epoch": 0.119344, + "grad_norm": 0.99609375, + "learning_rate": 8.87774193548387e-05, + "loss": 0.1887, + "step": 7459 + }, + { + "epoch": 0.11936, + "grad_norm": 0.96484375, + "learning_rate": 8.87758064516129e-05, + "loss": 0.1858, + "step": 7460 + }, + { + "epoch": 0.119376, + "grad_norm": 0.67578125, + "learning_rate": 8.87741935483871e-05, + "loss": 0.2125, + "step": 7461 + }, + { + "epoch": 0.119392, + "grad_norm": 1.15625, + "learning_rate": 8.87725806451613e-05, + "loss": 0.2162, + "step": 7462 + }, + { + "epoch": 0.119408, + "grad_norm": 0.8203125, + "learning_rate": 8.877096774193549e-05, + "loss": 0.2304, + "step": 7463 + }, + { + "epoch": 0.119424, + "grad_norm": 0.5625, + "learning_rate": 8.876935483870969e-05, + "loss": 0.1582, + "step": 7464 + }, + { + "epoch": 0.11944, + "grad_norm": 0.86328125, + "learning_rate": 8.876774193548387e-05, + "loss": 0.1693, + "step": 7465 + }, + { + "epoch": 0.119456, + "grad_norm": 1.0234375, + "learning_rate": 8.876612903225807e-05, + "loss": 0.1394, + "step": 7466 + }, + { + "epoch": 0.119472, + "grad_norm": 1.5390625, + "learning_rate": 8.876451612903226e-05, + "loss": 0.1747, + "step": 7467 + }, + { + "epoch": 0.119488, + "grad_norm": 1.7265625, + "learning_rate": 8.876290322580646e-05, + "loss": 0.1795, + "step": 7468 + }, + { + "epoch": 0.119504, + "grad_norm": 1.296875, + "learning_rate": 8.876129032258065e-05, + "loss": 0.1988, + "step": 7469 + }, + { + "epoch": 0.11952, + "grad_norm": 0.54296875, + "learning_rate": 8.875967741935485e-05, + "loss": 0.1554, + "step": 7470 + }, + { + "epoch": 0.119536, + "grad_norm": 0.75390625, + "learning_rate": 8.875806451612903e-05, + "loss": 0.1956, + "step": 7471 + }, + { + "epoch": 0.119552, + "grad_norm": 0.89453125, + "learning_rate": 8.875645161290323e-05, + "loss": 0.1896, + "step": 7472 + }, + { + "epoch": 0.119568, + "grad_norm": 0.7578125, + "learning_rate": 8.875483870967742e-05, + "loss": 0.2071, + "step": 7473 + }, + { + "epoch": 0.119584, + "grad_norm": 0.98828125, + "learning_rate": 8.875322580645162e-05, + "loss": 0.1654, + "step": 7474 + }, + { + "epoch": 0.1196, + "grad_norm": 0.6328125, + "learning_rate": 8.875161290322582e-05, + "loss": 0.1723, + "step": 7475 + }, + { + "epoch": 0.119616, + "grad_norm": 1.0859375, + "learning_rate": 8.875e-05, + "loss": 0.1507, + "step": 7476 + }, + { + "epoch": 0.119632, + "grad_norm": 0.9453125, + "learning_rate": 8.87483870967742e-05, + "loss": 0.1909, + "step": 7477 + }, + { + "epoch": 0.119648, + "grad_norm": 0.95703125, + "learning_rate": 8.874677419354839e-05, + "loss": 0.2004, + "step": 7478 + }, + { + "epoch": 0.119664, + "grad_norm": 0.71484375, + "learning_rate": 8.874516129032259e-05, + "loss": 0.1677, + "step": 7479 + }, + { + "epoch": 0.11968, + "grad_norm": 0.953125, + "learning_rate": 8.874354838709677e-05, + "loss": 0.1557, + "step": 7480 + }, + { + "epoch": 0.119696, + "grad_norm": 0.55078125, + "learning_rate": 8.874193548387097e-05, + "loss": 0.1523, + "step": 7481 + }, + { + "epoch": 0.119712, + "grad_norm": 0.59375, + "learning_rate": 8.874032258064516e-05, + "loss": 0.1345, + "step": 7482 + }, + { + "epoch": 0.119728, + "grad_norm": 0.6875, + "learning_rate": 8.873870967741936e-05, + "loss": 0.1641, + "step": 7483 + }, + { + "epoch": 0.119744, + "grad_norm": 0.8125, + "learning_rate": 8.873709677419355e-05, + "loss": 0.1671, + "step": 7484 + }, + { + "epoch": 0.11976, + "grad_norm": 1.3203125, + "learning_rate": 8.873548387096774e-05, + "loss": 0.1982, + "step": 7485 + }, + { + "epoch": 0.119776, + "grad_norm": 0.6328125, + "learning_rate": 8.873387096774194e-05, + "loss": 0.1587, + "step": 7486 + }, + { + "epoch": 0.119792, + "grad_norm": 0.9296875, + "learning_rate": 8.873225806451614e-05, + "loss": 0.1834, + "step": 7487 + }, + { + "epoch": 0.119808, + "grad_norm": 1.2421875, + "learning_rate": 8.873064516129033e-05, + "loss": 0.1587, + "step": 7488 + }, + { + "epoch": 0.119824, + "grad_norm": 0.609375, + "learning_rate": 8.872903225806452e-05, + "loss": 0.128, + "step": 7489 + }, + { + "epoch": 0.11984, + "grad_norm": 1.0, + "learning_rate": 8.872741935483872e-05, + "loss": 0.172, + "step": 7490 + }, + { + "epoch": 0.119856, + "grad_norm": 0.6484375, + "learning_rate": 8.87258064516129e-05, + "loss": 0.176, + "step": 7491 + }, + { + "epoch": 0.119872, + "grad_norm": 0.88671875, + "learning_rate": 8.87241935483871e-05, + "loss": 0.1941, + "step": 7492 + }, + { + "epoch": 0.119888, + "grad_norm": 1.09375, + "learning_rate": 8.872258064516129e-05, + "loss": 0.1894, + "step": 7493 + }, + { + "epoch": 0.119904, + "grad_norm": 0.8359375, + "learning_rate": 8.872096774193549e-05, + "loss": 0.1423, + "step": 7494 + }, + { + "epoch": 0.11992, + "grad_norm": 0.82421875, + "learning_rate": 8.871935483870967e-05, + "loss": 0.217, + "step": 7495 + }, + { + "epoch": 0.119936, + "grad_norm": 0.78125, + "learning_rate": 8.871774193548387e-05, + "loss": 0.1736, + "step": 7496 + }, + { + "epoch": 0.119952, + "grad_norm": 1.28125, + "learning_rate": 8.871612903225807e-05, + "loss": 0.199, + "step": 7497 + }, + { + "epoch": 0.119968, + "grad_norm": 0.9765625, + "learning_rate": 8.871451612903227e-05, + "loss": 0.2429, + "step": 7498 + }, + { + "epoch": 0.119984, + "grad_norm": 0.94140625, + "learning_rate": 8.871290322580646e-05, + "loss": 0.1867, + "step": 7499 + }, + { + "epoch": 0.12, + "grad_norm": 1.0234375, + "learning_rate": 8.871129032258066e-05, + "loss": 0.1944, + "step": 7500 + }, + { + "epoch": 0.120016, + "grad_norm": 0.65234375, + "learning_rate": 8.870967741935484e-05, + "loss": 0.1563, + "step": 7501 + }, + { + "epoch": 0.120032, + "grad_norm": 0.921875, + "learning_rate": 8.870806451612904e-05, + "loss": 0.1856, + "step": 7502 + }, + { + "epoch": 0.120048, + "grad_norm": 0.74609375, + "learning_rate": 8.870645161290323e-05, + "loss": 0.199, + "step": 7503 + }, + { + "epoch": 0.120064, + "grad_norm": 1.0625, + "learning_rate": 8.870483870967743e-05, + "loss": 0.2056, + "step": 7504 + }, + { + "epoch": 0.12008, + "grad_norm": 0.6875, + "learning_rate": 8.870322580645162e-05, + "loss": 0.2003, + "step": 7505 + }, + { + "epoch": 0.120096, + "grad_norm": 0.72265625, + "learning_rate": 8.87016129032258e-05, + "loss": 0.1834, + "step": 7506 + }, + { + "epoch": 0.120112, + "grad_norm": 1.234375, + "learning_rate": 8.87e-05, + "loss": 0.1567, + "step": 7507 + }, + { + "epoch": 0.120128, + "grad_norm": 0.75, + "learning_rate": 8.869838709677419e-05, + "loss": 0.1194, + "step": 7508 + }, + { + "epoch": 0.120144, + "grad_norm": 0.91796875, + "learning_rate": 8.869677419354839e-05, + "loss": 0.1991, + "step": 7509 + }, + { + "epoch": 0.12016, + "grad_norm": 0.75390625, + "learning_rate": 8.869516129032259e-05, + "loss": 0.2093, + "step": 7510 + }, + { + "epoch": 0.120176, + "grad_norm": 0.67578125, + "learning_rate": 8.869354838709679e-05, + "loss": 0.1771, + "step": 7511 + }, + { + "epoch": 0.120192, + "grad_norm": 0.5390625, + "learning_rate": 8.869193548387097e-05, + "loss": 0.1411, + "step": 7512 + }, + { + "epoch": 0.120208, + "grad_norm": 0.8046875, + "learning_rate": 8.869032258064517e-05, + "loss": 0.1833, + "step": 7513 + }, + { + "epoch": 0.120224, + "grad_norm": 0.8125, + "learning_rate": 8.868870967741936e-05, + "loss": 0.1597, + "step": 7514 + }, + { + "epoch": 0.12024, + "grad_norm": 0.96875, + "learning_rate": 8.868709677419356e-05, + "loss": 0.1824, + "step": 7515 + }, + { + "epoch": 0.120256, + "grad_norm": 0.69140625, + "learning_rate": 8.868548387096774e-05, + "loss": 0.1795, + "step": 7516 + }, + { + "epoch": 0.120272, + "grad_norm": 0.7734375, + "learning_rate": 8.868387096774194e-05, + "loss": 0.1771, + "step": 7517 + }, + { + "epoch": 0.120288, + "grad_norm": 0.66015625, + "learning_rate": 8.868225806451613e-05, + "loss": 0.184, + "step": 7518 + }, + { + "epoch": 0.120304, + "grad_norm": 0.8203125, + "learning_rate": 8.868064516129033e-05, + "loss": 0.183, + "step": 7519 + }, + { + "epoch": 0.12032, + "grad_norm": 0.9375, + "learning_rate": 8.867903225806452e-05, + "loss": 0.1749, + "step": 7520 + }, + { + "epoch": 0.120336, + "grad_norm": 1.3828125, + "learning_rate": 8.867741935483871e-05, + "loss": 0.2178, + "step": 7521 + }, + { + "epoch": 0.120352, + "grad_norm": 1.3125, + "learning_rate": 8.867580645161291e-05, + "loss": 0.1802, + "step": 7522 + }, + { + "epoch": 0.120368, + "grad_norm": 0.5390625, + "learning_rate": 8.86741935483871e-05, + "loss": 0.1771, + "step": 7523 + }, + { + "epoch": 0.120384, + "grad_norm": 0.94921875, + "learning_rate": 8.86725806451613e-05, + "loss": 0.1987, + "step": 7524 + }, + { + "epoch": 0.1204, + "grad_norm": 0.77734375, + "learning_rate": 8.867096774193549e-05, + "loss": 0.1573, + "step": 7525 + }, + { + "epoch": 0.120416, + "grad_norm": 0.87890625, + "learning_rate": 8.866935483870969e-05, + "loss": 0.1765, + "step": 7526 + }, + { + "epoch": 0.120432, + "grad_norm": 0.83203125, + "learning_rate": 8.866774193548387e-05, + "loss": 0.204, + "step": 7527 + }, + { + "epoch": 0.120448, + "grad_norm": 0.55859375, + "learning_rate": 8.866612903225807e-05, + "loss": 0.1609, + "step": 7528 + }, + { + "epoch": 0.120464, + "grad_norm": 0.66015625, + "learning_rate": 8.866451612903226e-05, + "loss": 0.1935, + "step": 7529 + }, + { + "epoch": 0.12048, + "grad_norm": 0.84375, + "learning_rate": 8.866290322580646e-05, + "loss": 0.166, + "step": 7530 + }, + { + "epoch": 0.120496, + "grad_norm": 0.73828125, + "learning_rate": 8.866129032258064e-05, + "loss": 0.1607, + "step": 7531 + }, + { + "epoch": 0.120512, + "grad_norm": 0.78515625, + "learning_rate": 8.865967741935484e-05, + "loss": 0.181, + "step": 7532 + }, + { + "epoch": 0.120528, + "grad_norm": 1.1484375, + "learning_rate": 8.865806451612904e-05, + "loss": 0.1762, + "step": 7533 + }, + { + "epoch": 0.120544, + "grad_norm": 0.6953125, + "learning_rate": 8.865645161290323e-05, + "loss": 0.1509, + "step": 7534 + }, + { + "epoch": 0.12056, + "grad_norm": 0.8828125, + "learning_rate": 8.865483870967743e-05, + "loss": 0.2192, + "step": 7535 + }, + { + "epoch": 0.120576, + "grad_norm": 1.0859375, + "learning_rate": 8.865322580645161e-05, + "loss": 0.2181, + "step": 7536 + }, + { + "epoch": 0.120592, + "grad_norm": 0.76953125, + "learning_rate": 8.865161290322581e-05, + "loss": 0.167, + "step": 7537 + }, + { + "epoch": 0.120608, + "grad_norm": 1.015625, + "learning_rate": 8.865e-05, + "loss": 0.149, + "step": 7538 + }, + { + "epoch": 0.120624, + "grad_norm": 0.94921875, + "learning_rate": 8.86483870967742e-05, + "loss": 0.1426, + "step": 7539 + }, + { + "epoch": 0.12064, + "grad_norm": 0.87109375, + "learning_rate": 8.864677419354839e-05, + "loss": 0.1724, + "step": 7540 + }, + { + "epoch": 0.120656, + "grad_norm": 0.765625, + "learning_rate": 8.864516129032259e-05, + "loss": 0.184, + "step": 7541 + }, + { + "epoch": 0.120672, + "grad_norm": 0.62890625, + "learning_rate": 8.864354838709677e-05, + "loss": 0.1831, + "step": 7542 + }, + { + "epoch": 0.120688, + "grad_norm": 0.96875, + "learning_rate": 8.864193548387097e-05, + "loss": 0.2306, + "step": 7543 + }, + { + "epoch": 0.120704, + "grad_norm": 0.80078125, + "learning_rate": 8.864032258064516e-05, + "loss": 0.1852, + "step": 7544 + }, + { + "epoch": 0.12072, + "grad_norm": 0.8125, + "learning_rate": 8.863870967741936e-05, + "loss": 0.1613, + "step": 7545 + }, + { + "epoch": 0.120736, + "grad_norm": 1.2265625, + "learning_rate": 8.863709677419356e-05, + "loss": 0.2108, + "step": 7546 + }, + { + "epoch": 0.120752, + "grad_norm": 0.73828125, + "learning_rate": 8.863548387096776e-05, + "loss": 0.1623, + "step": 7547 + }, + { + "epoch": 0.120768, + "grad_norm": 0.9296875, + "learning_rate": 8.863387096774194e-05, + "loss": 0.1566, + "step": 7548 + }, + { + "epoch": 0.120784, + "grad_norm": 1.1015625, + "learning_rate": 8.863225806451614e-05, + "loss": 0.187, + "step": 7549 + }, + { + "epoch": 0.1208, + "grad_norm": 0.64453125, + "learning_rate": 8.863064516129033e-05, + "loss": 0.1672, + "step": 7550 + }, + { + "epoch": 0.120816, + "grad_norm": 0.95703125, + "learning_rate": 8.862903225806451e-05, + "loss": 0.1946, + "step": 7551 + }, + { + "epoch": 0.120832, + "grad_norm": 0.91015625, + "learning_rate": 8.862741935483871e-05, + "loss": 0.1613, + "step": 7552 + }, + { + "epoch": 0.120848, + "grad_norm": 0.76953125, + "learning_rate": 8.86258064516129e-05, + "loss": 0.1978, + "step": 7553 + }, + { + "epoch": 0.120864, + "grad_norm": 0.8359375, + "learning_rate": 8.86241935483871e-05, + "loss": 0.1844, + "step": 7554 + }, + { + "epoch": 0.12088, + "grad_norm": 1.15625, + "learning_rate": 8.862258064516129e-05, + "loss": 0.1468, + "step": 7555 + }, + { + "epoch": 0.120896, + "grad_norm": 0.70703125, + "learning_rate": 8.862096774193548e-05, + "loss": 0.155, + "step": 7556 + }, + { + "epoch": 0.120912, + "grad_norm": 0.546875, + "learning_rate": 8.861935483870968e-05, + "loss": 0.1135, + "step": 7557 + }, + { + "epoch": 0.120928, + "grad_norm": 0.486328125, + "learning_rate": 8.861774193548388e-05, + "loss": 0.1528, + "step": 7558 + }, + { + "epoch": 0.120944, + "grad_norm": 0.55078125, + "learning_rate": 8.861612903225807e-05, + "loss": 0.1491, + "step": 7559 + }, + { + "epoch": 0.12096, + "grad_norm": 0.5859375, + "learning_rate": 8.861451612903227e-05, + "loss": 0.1813, + "step": 7560 + }, + { + "epoch": 0.120976, + "grad_norm": 0.9296875, + "learning_rate": 8.861290322580646e-05, + "loss": 0.1674, + "step": 7561 + }, + { + "epoch": 0.120992, + "grad_norm": 0.62890625, + "learning_rate": 8.861129032258066e-05, + "loss": 0.1939, + "step": 7562 + }, + { + "epoch": 0.121008, + "grad_norm": 0.74609375, + "learning_rate": 8.860967741935484e-05, + "loss": 0.1998, + "step": 7563 + }, + { + "epoch": 0.121024, + "grad_norm": 0.8359375, + "learning_rate": 8.860806451612904e-05, + "loss": 0.1677, + "step": 7564 + }, + { + "epoch": 0.12104, + "grad_norm": 0.75, + "learning_rate": 8.860645161290323e-05, + "loss": 0.1843, + "step": 7565 + }, + { + "epoch": 0.121056, + "grad_norm": 0.80078125, + "learning_rate": 8.860483870967743e-05, + "loss": 0.1998, + "step": 7566 + }, + { + "epoch": 0.121072, + "grad_norm": 1.125, + "learning_rate": 8.860322580645161e-05, + "loss": 0.1428, + "step": 7567 + }, + { + "epoch": 0.121088, + "grad_norm": 1.296875, + "learning_rate": 8.86016129032258e-05, + "loss": 0.1878, + "step": 7568 + }, + { + "epoch": 0.121104, + "grad_norm": 0.9140625, + "learning_rate": 8.86e-05, + "loss": 0.1879, + "step": 7569 + }, + { + "epoch": 0.12112, + "grad_norm": 1.296875, + "learning_rate": 8.85983870967742e-05, + "loss": 0.1909, + "step": 7570 + }, + { + "epoch": 0.121136, + "grad_norm": 0.69140625, + "learning_rate": 8.85967741935484e-05, + "loss": 0.181, + "step": 7571 + }, + { + "epoch": 0.121152, + "grad_norm": 0.64453125, + "learning_rate": 8.859516129032258e-05, + "loss": 0.1358, + "step": 7572 + }, + { + "epoch": 0.121168, + "grad_norm": 0.890625, + "learning_rate": 8.859354838709678e-05, + "loss": 0.1799, + "step": 7573 + }, + { + "epoch": 0.121184, + "grad_norm": 0.70703125, + "learning_rate": 8.859193548387097e-05, + "loss": 0.1713, + "step": 7574 + }, + { + "epoch": 0.1212, + "grad_norm": 0.72265625, + "learning_rate": 8.859032258064517e-05, + "loss": 0.1677, + "step": 7575 + }, + { + "epoch": 0.121216, + "grad_norm": 0.56640625, + "learning_rate": 8.858870967741936e-05, + "loss": 0.1853, + "step": 7576 + }, + { + "epoch": 0.121232, + "grad_norm": 0.70703125, + "learning_rate": 8.858709677419356e-05, + "loss": 0.1536, + "step": 7577 + }, + { + "epoch": 0.121248, + "grad_norm": 0.76171875, + "learning_rate": 8.858548387096774e-05, + "loss": 0.137, + "step": 7578 + }, + { + "epoch": 0.121264, + "grad_norm": 0.87890625, + "learning_rate": 8.858387096774194e-05, + "loss": 0.1635, + "step": 7579 + }, + { + "epoch": 0.12128, + "grad_norm": 0.75, + "learning_rate": 8.858225806451613e-05, + "loss": 0.1682, + "step": 7580 + }, + { + "epoch": 0.121296, + "grad_norm": 0.68359375, + "learning_rate": 8.858064516129033e-05, + "loss": 0.1439, + "step": 7581 + }, + { + "epoch": 0.121312, + "grad_norm": 0.89453125, + "learning_rate": 8.857903225806453e-05, + "loss": 0.1872, + "step": 7582 + }, + { + "epoch": 0.121328, + "grad_norm": 1.2578125, + "learning_rate": 8.857741935483871e-05, + "loss": 0.1875, + "step": 7583 + }, + { + "epoch": 0.121344, + "grad_norm": 0.9296875, + "learning_rate": 8.857580645161291e-05, + "loss": 0.1744, + "step": 7584 + }, + { + "epoch": 0.12136, + "grad_norm": 0.89453125, + "learning_rate": 8.85741935483871e-05, + "loss": 0.1662, + "step": 7585 + }, + { + "epoch": 0.121376, + "grad_norm": 1.1171875, + "learning_rate": 8.85725806451613e-05, + "loss": 0.1983, + "step": 7586 + }, + { + "epoch": 0.121392, + "grad_norm": 1.078125, + "learning_rate": 8.857096774193548e-05, + "loss": 0.1958, + "step": 7587 + }, + { + "epoch": 0.121408, + "grad_norm": 0.94140625, + "learning_rate": 8.856935483870968e-05, + "loss": 0.1584, + "step": 7588 + }, + { + "epoch": 0.121424, + "grad_norm": 0.703125, + "learning_rate": 8.856774193548387e-05, + "loss": 0.1674, + "step": 7589 + }, + { + "epoch": 0.12144, + "grad_norm": 0.73828125, + "learning_rate": 8.856612903225807e-05, + "loss": 0.1674, + "step": 7590 + }, + { + "epoch": 0.121456, + "grad_norm": 0.65234375, + "learning_rate": 8.856451612903226e-05, + "loss": 0.1642, + "step": 7591 + }, + { + "epoch": 0.121472, + "grad_norm": 0.76953125, + "learning_rate": 8.856290322580645e-05, + "loss": 0.1848, + "step": 7592 + }, + { + "epoch": 0.121488, + "grad_norm": 0.8984375, + "learning_rate": 8.856129032258065e-05, + "loss": 0.1826, + "step": 7593 + }, + { + "epoch": 0.121504, + "grad_norm": 1.171875, + "learning_rate": 8.855967741935485e-05, + "loss": 0.2438, + "step": 7594 + }, + { + "epoch": 0.12152, + "grad_norm": 0.828125, + "learning_rate": 8.855806451612904e-05, + "loss": 0.1416, + "step": 7595 + }, + { + "epoch": 0.121536, + "grad_norm": 0.71875, + "learning_rate": 8.855645161290324e-05, + "loss": 0.2006, + "step": 7596 + }, + { + "epoch": 0.121552, + "grad_norm": 0.61328125, + "learning_rate": 8.855483870967743e-05, + "loss": 0.1749, + "step": 7597 + }, + { + "epoch": 0.121568, + "grad_norm": 0.74609375, + "learning_rate": 8.855322580645161e-05, + "loss": 0.1815, + "step": 7598 + }, + { + "epoch": 0.121584, + "grad_norm": 0.640625, + "learning_rate": 8.855161290322581e-05, + "loss": 0.1884, + "step": 7599 + }, + { + "epoch": 0.1216, + "grad_norm": 0.65625, + "learning_rate": 8.855e-05, + "loss": 0.1853, + "step": 7600 + }, + { + "epoch": 0.121616, + "grad_norm": 0.9296875, + "learning_rate": 8.85483870967742e-05, + "loss": 0.1902, + "step": 7601 + }, + { + "epoch": 0.121632, + "grad_norm": 0.796875, + "learning_rate": 8.854677419354838e-05, + "loss": 0.1869, + "step": 7602 + }, + { + "epoch": 0.121648, + "grad_norm": 0.8671875, + "learning_rate": 8.854516129032258e-05, + "loss": 0.1808, + "step": 7603 + }, + { + "epoch": 0.121664, + "grad_norm": 1.0625, + "learning_rate": 8.854354838709677e-05, + "loss": 0.1976, + "step": 7604 + }, + { + "epoch": 0.12168, + "grad_norm": 0.99609375, + "learning_rate": 8.854193548387097e-05, + "loss": 0.2293, + "step": 7605 + }, + { + "epoch": 0.121696, + "grad_norm": 0.78515625, + "learning_rate": 8.854032258064517e-05, + "loss": 0.2007, + "step": 7606 + }, + { + "epoch": 0.121712, + "grad_norm": 1.0078125, + "learning_rate": 8.853870967741937e-05, + "loss": 0.1468, + "step": 7607 + }, + { + "epoch": 0.121728, + "grad_norm": 1.1015625, + "learning_rate": 8.853709677419355e-05, + "loss": 0.1802, + "step": 7608 + }, + { + "epoch": 0.121744, + "grad_norm": 1.03125, + "learning_rate": 8.853548387096775e-05, + "loss": 0.2131, + "step": 7609 + }, + { + "epoch": 0.12176, + "grad_norm": 1.140625, + "learning_rate": 8.853387096774194e-05, + "loss": 0.1835, + "step": 7610 + }, + { + "epoch": 0.121776, + "grad_norm": 0.83203125, + "learning_rate": 8.853225806451614e-05, + "loss": 0.1813, + "step": 7611 + }, + { + "epoch": 0.121792, + "grad_norm": 0.8359375, + "learning_rate": 8.853064516129033e-05, + "loss": 0.1455, + "step": 7612 + }, + { + "epoch": 0.121808, + "grad_norm": 0.65234375, + "learning_rate": 8.852903225806451e-05, + "loss": 0.1749, + "step": 7613 + }, + { + "epoch": 0.121824, + "grad_norm": 1.4296875, + "learning_rate": 8.852741935483871e-05, + "loss": 0.1822, + "step": 7614 + }, + { + "epoch": 0.12184, + "grad_norm": 1.4140625, + "learning_rate": 8.85258064516129e-05, + "loss": 0.1878, + "step": 7615 + }, + { + "epoch": 0.121856, + "grad_norm": 0.921875, + "learning_rate": 8.85241935483871e-05, + "loss": 0.1651, + "step": 7616 + }, + { + "epoch": 0.121872, + "grad_norm": 1.6015625, + "learning_rate": 8.85225806451613e-05, + "loss": 0.1794, + "step": 7617 + }, + { + "epoch": 0.121888, + "grad_norm": 0.921875, + "learning_rate": 8.85209677419355e-05, + "loss": 0.1636, + "step": 7618 + }, + { + "epoch": 0.121904, + "grad_norm": 0.8125, + "learning_rate": 8.851935483870968e-05, + "loss": 0.1763, + "step": 7619 + }, + { + "epoch": 0.12192, + "grad_norm": 0.7421875, + "learning_rate": 8.851774193548388e-05, + "loss": 0.1812, + "step": 7620 + }, + { + "epoch": 0.121936, + "grad_norm": 1.015625, + "learning_rate": 8.851612903225807e-05, + "loss": 0.1804, + "step": 7621 + }, + { + "epoch": 0.121952, + "grad_norm": 1.0078125, + "learning_rate": 8.851451612903227e-05, + "loss": 0.1489, + "step": 7622 + }, + { + "epoch": 0.121968, + "grad_norm": 1.265625, + "learning_rate": 8.851290322580645e-05, + "loss": 0.1561, + "step": 7623 + }, + { + "epoch": 0.121984, + "grad_norm": 1.359375, + "learning_rate": 8.851129032258065e-05, + "loss": 0.1959, + "step": 7624 + }, + { + "epoch": 0.122, + "grad_norm": 0.8359375, + "learning_rate": 8.850967741935484e-05, + "loss": 0.1817, + "step": 7625 + }, + { + "epoch": 0.122016, + "grad_norm": 0.73046875, + "learning_rate": 8.850806451612904e-05, + "loss": 0.1703, + "step": 7626 + }, + { + "epoch": 0.122032, + "grad_norm": 0.98828125, + "learning_rate": 8.850645161290322e-05, + "loss": 0.2248, + "step": 7627 + }, + { + "epoch": 0.122048, + "grad_norm": 0.78515625, + "learning_rate": 8.850483870967742e-05, + "loss": 0.1721, + "step": 7628 + }, + { + "epoch": 0.122064, + "grad_norm": 0.81640625, + "learning_rate": 8.850322580645161e-05, + "loss": 0.1563, + "step": 7629 + }, + { + "epoch": 0.12208, + "grad_norm": 0.75, + "learning_rate": 8.850161290322581e-05, + "loss": 0.1895, + "step": 7630 + }, + { + "epoch": 0.122096, + "grad_norm": 0.89453125, + "learning_rate": 8.850000000000001e-05, + "loss": 0.1926, + "step": 7631 + }, + { + "epoch": 0.122112, + "grad_norm": 0.91796875, + "learning_rate": 8.84983870967742e-05, + "loss": 0.1743, + "step": 7632 + }, + { + "epoch": 0.122128, + "grad_norm": 1.0390625, + "learning_rate": 8.84967741935484e-05, + "loss": 0.1671, + "step": 7633 + }, + { + "epoch": 0.122144, + "grad_norm": 0.77734375, + "learning_rate": 8.849516129032258e-05, + "loss": 0.1625, + "step": 7634 + }, + { + "epoch": 0.12216, + "grad_norm": 1.1953125, + "learning_rate": 8.849354838709678e-05, + "loss": 0.1373, + "step": 7635 + }, + { + "epoch": 0.122176, + "grad_norm": 0.95703125, + "learning_rate": 8.849193548387097e-05, + "loss": 0.1667, + "step": 7636 + }, + { + "epoch": 0.122192, + "grad_norm": 0.8671875, + "learning_rate": 8.849032258064517e-05, + "loss": 0.1846, + "step": 7637 + }, + { + "epoch": 0.122208, + "grad_norm": 0.640625, + "learning_rate": 8.848870967741935e-05, + "loss": 0.1751, + "step": 7638 + }, + { + "epoch": 0.122224, + "grad_norm": 0.55859375, + "learning_rate": 8.848709677419355e-05, + "loss": 0.1704, + "step": 7639 + }, + { + "epoch": 0.12224, + "grad_norm": 0.70703125, + "learning_rate": 8.848548387096774e-05, + "loss": 0.1939, + "step": 7640 + }, + { + "epoch": 0.122256, + "grad_norm": 0.69921875, + "learning_rate": 8.848387096774194e-05, + "loss": 0.172, + "step": 7641 + }, + { + "epoch": 0.122272, + "grad_norm": 1.0625, + "learning_rate": 8.848225806451614e-05, + "loss": 0.1812, + "step": 7642 + }, + { + "epoch": 0.122288, + "grad_norm": 0.58203125, + "learning_rate": 8.848064516129034e-05, + "loss": 0.1655, + "step": 7643 + }, + { + "epoch": 0.122304, + "grad_norm": 0.74609375, + "learning_rate": 8.847903225806452e-05, + "loss": 0.1648, + "step": 7644 + }, + { + "epoch": 0.12232, + "grad_norm": 1.171875, + "learning_rate": 8.847741935483871e-05, + "loss": 0.2066, + "step": 7645 + }, + { + "epoch": 0.122336, + "grad_norm": 0.57421875, + "learning_rate": 8.847580645161291e-05, + "loss": 0.1619, + "step": 7646 + }, + { + "epoch": 0.122352, + "grad_norm": 1.0859375, + "learning_rate": 8.84741935483871e-05, + "loss": 0.2023, + "step": 7647 + }, + { + "epoch": 0.122368, + "grad_norm": 1.203125, + "learning_rate": 8.84725806451613e-05, + "loss": 0.135, + "step": 7648 + }, + { + "epoch": 0.122384, + "grad_norm": 1.0859375, + "learning_rate": 8.847096774193548e-05, + "loss": 0.1773, + "step": 7649 + }, + { + "epoch": 0.1224, + "grad_norm": 0.5546875, + "learning_rate": 8.846935483870968e-05, + "loss": 0.1853, + "step": 7650 + }, + { + "epoch": 0.122416, + "grad_norm": 2.65625, + "learning_rate": 8.846774193548387e-05, + "loss": 0.2015, + "step": 7651 + }, + { + "epoch": 0.122432, + "grad_norm": 1.859375, + "learning_rate": 8.846612903225807e-05, + "loss": 0.1422, + "step": 7652 + }, + { + "epoch": 0.122448, + "grad_norm": 0.87890625, + "learning_rate": 8.846451612903227e-05, + "loss": 0.2599, + "step": 7653 + }, + { + "epoch": 0.122464, + "grad_norm": 1.296875, + "learning_rate": 8.846290322580647e-05, + "loss": 0.1771, + "step": 7654 + }, + { + "epoch": 0.12248, + "grad_norm": 0.84375, + "learning_rate": 8.846129032258065e-05, + "loss": 0.1681, + "step": 7655 + }, + { + "epoch": 0.122496, + "grad_norm": 0.6796875, + "learning_rate": 8.845967741935485e-05, + "loss": 0.1218, + "step": 7656 + }, + { + "epoch": 0.122512, + "grad_norm": 0.96484375, + "learning_rate": 8.845806451612904e-05, + "loss": 0.1878, + "step": 7657 + }, + { + "epoch": 0.122528, + "grad_norm": 1.4453125, + "learning_rate": 8.845645161290324e-05, + "loss": 0.2088, + "step": 7658 + }, + { + "epoch": 0.122544, + "grad_norm": 0.83203125, + "learning_rate": 8.845483870967742e-05, + "loss": 0.1788, + "step": 7659 + }, + { + "epoch": 0.12256, + "grad_norm": 0.8203125, + "learning_rate": 8.845322580645161e-05, + "loss": 0.1806, + "step": 7660 + }, + { + "epoch": 0.122576, + "grad_norm": 0.6484375, + "learning_rate": 8.845161290322581e-05, + "loss": 0.1779, + "step": 7661 + }, + { + "epoch": 0.122592, + "grad_norm": 0.6640625, + "learning_rate": 8.845e-05, + "loss": 0.1801, + "step": 7662 + }, + { + "epoch": 0.122608, + "grad_norm": 0.7578125, + "learning_rate": 8.84483870967742e-05, + "loss": 0.1998, + "step": 7663 + }, + { + "epoch": 0.122624, + "grad_norm": 0.66015625, + "learning_rate": 8.844677419354838e-05, + "loss": 0.1684, + "step": 7664 + }, + { + "epoch": 0.12264, + "grad_norm": 0.57421875, + "learning_rate": 8.844516129032258e-05, + "loss": 0.1336, + "step": 7665 + }, + { + "epoch": 0.122656, + "grad_norm": 1.0, + "learning_rate": 8.844354838709678e-05, + "loss": 0.1611, + "step": 7666 + }, + { + "epoch": 0.122672, + "grad_norm": 1.03125, + "learning_rate": 8.844193548387098e-05, + "loss": 0.2143, + "step": 7667 + }, + { + "epoch": 0.122688, + "grad_norm": 0.92578125, + "learning_rate": 8.844032258064517e-05, + "loss": 0.1586, + "step": 7668 + }, + { + "epoch": 0.122704, + "grad_norm": 0.90234375, + "learning_rate": 8.843870967741937e-05, + "loss": 0.1569, + "step": 7669 + }, + { + "epoch": 0.12272, + "grad_norm": 0.83203125, + "learning_rate": 8.843709677419355e-05, + "loss": 0.2018, + "step": 7670 + }, + { + "epoch": 0.122736, + "grad_norm": 0.91015625, + "learning_rate": 8.843548387096775e-05, + "loss": 0.1892, + "step": 7671 + }, + { + "epoch": 0.122752, + "grad_norm": 0.6171875, + "learning_rate": 8.843387096774194e-05, + "loss": 0.1683, + "step": 7672 + }, + { + "epoch": 0.122768, + "grad_norm": 0.75390625, + "learning_rate": 8.843225806451614e-05, + "loss": 0.1872, + "step": 7673 + }, + { + "epoch": 0.122784, + "grad_norm": 0.7890625, + "learning_rate": 8.843064516129032e-05, + "loss": 0.1452, + "step": 7674 + }, + { + "epoch": 0.1228, + "grad_norm": 0.8515625, + "learning_rate": 8.842903225806452e-05, + "loss": 0.1833, + "step": 7675 + }, + { + "epoch": 0.122816, + "grad_norm": 0.83203125, + "learning_rate": 8.842741935483871e-05, + "loss": 0.1419, + "step": 7676 + }, + { + "epoch": 0.122832, + "grad_norm": 1.203125, + "learning_rate": 8.842580645161291e-05, + "loss": 0.209, + "step": 7677 + }, + { + "epoch": 0.122848, + "grad_norm": 0.66015625, + "learning_rate": 8.842419354838711e-05, + "loss": 0.1726, + "step": 7678 + }, + { + "epoch": 0.122864, + "grad_norm": 1.09375, + "learning_rate": 8.84225806451613e-05, + "loss": 0.2035, + "step": 7679 + }, + { + "epoch": 0.12288, + "grad_norm": 1.109375, + "learning_rate": 8.84209677419355e-05, + "loss": 0.1818, + "step": 7680 + }, + { + "epoch": 0.122896, + "grad_norm": 0.6953125, + "learning_rate": 8.841935483870968e-05, + "loss": 0.1602, + "step": 7681 + }, + { + "epoch": 0.122912, + "grad_norm": 1.2109375, + "learning_rate": 8.841774193548388e-05, + "loss": 0.1712, + "step": 7682 + }, + { + "epoch": 0.122928, + "grad_norm": 1.1171875, + "learning_rate": 8.841612903225807e-05, + "loss": 0.1892, + "step": 7683 + }, + { + "epoch": 0.122944, + "grad_norm": 0.6875, + "learning_rate": 8.841451612903226e-05, + "loss": 0.1542, + "step": 7684 + }, + { + "epoch": 0.12296, + "grad_norm": 0.73046875, + "learning_rate": 8.841290322580645e-05, + "loss": 0.1503, + "step": 7685 + }, + { + "epoch": 0.122976, + "grad_norm": 0.57421875, + "learning_rate": 8.841129032258065e-05, + "loss": 0.1737, + "step": 7686 + }, + { + "epoch": 0.122992, + "grad_norm": 0.75390625, + "learning_rate": 8.840967741935484e-05, + "loss": 0.2088, + "step": 7687 + }, + { + "epoch": 0.123008, + "grad_norm": 0.76953125, + "learning_rate": 8.840806451612904e-05, + "loss": 0.1793, + "step": 7688 + }, + { + "epoch": 0.123024, + "grad_norm": 0.625, + "learning_rate": 8.840645161290324e-05, + "loss": 0.1656, + "step": 7689 + }, + { + "epoch": 0.12304, + "grad_norm": 0.93359375, + "learning_rate": 8.840483870967742e-05, + "loss": 0.152, + "step": 7690 + }, + { + "epoch": 0.123056, + "grad_norm": 1.0625, + "learning_rate": 8.840322580645162e-05, + "loss": 0.1951, + "step": 7691 + }, + { + "epoch": 0.123072, + "grad_norm": 1.1875, + "learning_rate": 8.840161290322581e-05, + "loss": 0.2178, + "step": 7692 + }, + { + "epoch": 0.123088, + "grad_norm": 0.86328125, + "learning_rate": 8.840000000000001e-05, + "loss": 0.1813, + "step": 7693 + }, + { + "epoch": 0.123104, + "grad_norm": 0.98828125, + "learning_rate": 8.83983870967742e-05, + "loss": 0.1718, + "step": 7694 + }, + { + "epoch": 0.12312, + "grad_norm": 0.66015625, + "learning_rate": 8.839677419354839e-05, + "loss": 0.1321, + "step": 7695 + }, + { + "epoch": 0.123136, + "grad_norm": 0.640625, + "learning_rate": 8.839516129032258e-05, + "loss": 0.1566, + "step": 7696 + }, + { + "epoch": 0.123152, + "grad_norm": 0.69140625, + "learning_rate": 8.839354838709678e-05, + "loss": 0.1526, + "step": 7697 + }, + { + "epoch": 0.123168, + "grad_norm": 0.8046875, + "learning_rate": 8.839193548387096e-05, + "loss": 0.1803, + "step": 7698 + }, + { + "epoch": 0.123184, + "grad_norm": 0.59765625, + "learning_rate": 8.839032258064516e-05, + "loss": 0.1521, + "step": 7699 + }, + { + "epoch": 0.1232, + "grad_norm": 1.078125, + "learning_rate": 8.838870967741935e-05, + "loss": 0.1972, + "step": 7700 + }, + { + "epoch": 0.123216, + "grad_norm": 0.59765625, + "learning_rate": 8.838709677419355e-05, + "loss": 0.1502, + "step": 7701 + }, + { + "epoch": 0.123232, + "grad_norm": 0.890625, + "learning_rate": 8.838548387096775e-05, + "loss": 0.1942, + "step": 7702 + }, + { + "epoch": 0.123248, + "grad_norm": 0.79296875, + "learning_rate": 8.838387096774195e-05, + "loss": 0.1892, + "step": 7703 + }, + { + "epoch": 0.123264, + "grad_norm": 1.09375, + "learning_rate": 8.838225806451614e-05, + "loss": 0.2077, + "step": 7704 + }, + { + "epoch": 0.12328, + "grad_norm": 0.66796875, + "learning_rate": 8.838064516129034e-05, + "loss": 0.1545, + "step": 7705 + }, + { + "epoch": 0.123296, + "grad_norm": 1.234375, + "learning_rate": 8.837903225806452e-05, + "loss": 0.2078, + "step": 7706 + }, + { + "epoch": 0.123312, + "grad_norm": 0.9296875, + "learning_rate": 8.837741935483871e-05, + "loss": 0.1847, + "step": 7707 + }, + { + "epoch": 0.123328, + "grad_norm": 0.72265625, + "learning_rate": 8.837580645161291e-05, + "loss": 0.1742, + "step": 7708 + }, + { + "epoch": 0.123344, + "grad_norm": 1.25, + "learning_rate": 8.837419354838709e-05, + "loss": 0.19, + "step": 7709 + }, + { + "epoch": 0.12336, + "grad_norm": 0.8984375, + "learning_rate": 8.837258064516129e-05, + "loss": 0.2347, + "step": 7710 + }, + { + "epoch": 0.123376, + "grad_norm": 0.69140625, + "learning_rate": 8.837096774193548e-05, + "loss": 0.1822, + "step": 7711 + }, + { + "epoch": 0.123392, + "grad_norm": 0.53125, + "learning_rate": 8.836935483870968e-05, + "loss": 0.1565, + "step": 7712 + }, + { + "epoch": 0.123408, + "grad_norm": 1.859375, + "learning_rate": 8.836774193548388e-05, + "loss": 0.2332, + "step": 7713 + }, + { + "epoch": 0.123424, + "grad_norm": 0.84765625, + "learning_rate": 8.836612903225808e-05, + "loss": 0.2039, + "step": 7714 + }, + { + "epoch": 0.12344, + "grad_norm": 1.0703125, + "learning_rate": 8.836451612903226e-05, + "loss": 0.1342, + "step": 7715 + }, + { + "epoch": 0.123456, + "grad_norm": 0.84765625, + "learning_rate": 8.836290322580646e-05, + "loss": 0.2026, + "step": 7716 + }, + { + "epoch": 0.123472, + "grad_norm": 0.546875, + "learning_rate": 8.836129032258065e-05, + "loss": 0.1744, + "step": 7717 + }, + { + "epoch": 0.123488, + "grad_norm": 0.703125, + "learning_rate": 8.835967741935485e-05, + "loss": 0.1468, + "step": 7718 + }, + { + "epoch": 0.123504, + "grad_norm": 1.921875, + "learning_rate": 8.835806451612904e-05, + "loss": 0.1836, + "step": 7719 + }, + { + "epoch": 0.12352, + "grad_norm": 0.6171875, + "learning_rate": 8.835645161290323e-05, + "loss": 0.1452, + "step": 7720 + }, + { + "epoch": 0.123536, + "grad_norm": 0.6171875, + "learning_rate": 8.835483870967742e-05, + "loss": 0.1344, + "step": 7721 + }, + { + "epoch": 0.123552, + "grad_norm": 0.73828125, + "learning_rate": 8.835322580645161e-05, + "loss": 0.1901, + "step": 7722 + }, + { + "epoch": 0.123568, + "grad_norm": 1.0078125, + "learning_rate": 8.83516129032258e-05, + "loss": 0.161, + "step": 7723 + }, + { + "epoch": 0.123584, + "grad_norm": 0.828125, + "learning_rate": 8.834999999999999e-05, + "loss": 0.2083, + "step": 7724 + }, + { + "epoch": 0.1236, + "grad_norm": 0.94921875, + "learning_rate": 8.834838709677419e-05, + "loss": 0.2173, + "step": 7725 + }, + { + "epoch": 0.123616, + "grad_norm": 0.9140625, + "learning_rate": 8.834677419354839e-05, + "loss": 0.1659, + "step": 7726 + }, + { + "epoch": 0.123632, + "grad_norm": 0.67578125, + "learning_rate": 8.834516129032259e-05, + "loss": 0.1804, + "step": 7727 + }, + { + "epoch": 0.123648, + "grad_norm": 0.60546875, + "learning_rate": 8.834354838709678e-05, + "loss": 0.1736, + "step": 7728 + }, + { + "epoch": 0.123664, + "grad_norm": 0.9375, + "learning_rate": 8.834193548387098e-05, + "loss": 0.2023, + "step": 7729 + }, + { + "epoch": 0.12368, + "grad_norm": 0.81640625, + "learning_rate": 8.834032258064516e-05, + "loss": 0.1645, + "step": 7730 + }, + { + "epoch": 0.123696, + "grad_norm": 0.93359375, + "learning_rate": 8.833870967741936e-05, + "loss": 0.204, + "step": 7731 + }, + { + "epoch": 0.123712, + "grad_norm": 1.203125, + "learning_rate": 8.833709677419355e-05, + "loss": 0.1445, + "step": 7732 + }, + { + "epoch": 0.123728, + "grad_norm": 0.7265625, + "learning_rate": 8.833548387096775e-05, + "loss": 0.1786, + "step": 7733 + }, + { + "epoch": 0.123744, + "grad_norm": 0.78515625, + "learning_rate": 8.833387096774193e-05, + "loss": 0.1868, + "step": 7734 + }, + { + "epoch": 0.12376, + "grad_norm": 0.59375, + "learning_rate": 8.833225806451613e-05, + "loss": 0.1541, + "step": 7735 + }, + { + "epoch": 0.123776, + "grad_norm": 1.40625, + "learning_rate": 8.833064516129032e-05, + "loss": 0.2079, + "step": 7736 + }, + { + "epoch": 0.123792, + "grad_norm": 0.69921875, + "learning_rate": 8.832903225806452e-05, + "loss": 0.1944, + "step": 7737 + }, + { + "epoch": 0.123808, + "grad_norm": 0.671875, + "learning_rate": 8.832741935483872e-05, + "loss": 0.1876, + "step": 7738 + }, + { + "epoch": 0.123824, + "grad_norm": 0.76953125, + "learning_rate": 8.83258064516129e-05, + "loss": 0.2166, + "step": 7739 + }, + { + "epoch": 0.12384, + "grad_norm": 0.890625, + "learning_rate": 8.83241935483871e-05, + "loss": 0.1735, + "step": 7740 + }, + { + "epoch": 0.123856, + "grad_norm": 1.0234375, + "learning_rate": 8.832258064516129e-05, + "loss": 0.2157, + "step": 7741 + }, + { + "epoch": 0.123872, + "grad_norm": 0.71875, + "learning_rate": 8.832096774193549e-05, + "loss": 0.1542, + "step": 7742 + }, + { + "epoch": 0.123888, + "grad_norm": 0.83203125, + "learning_rate": 8.831935483870968e-05, + "loss": 0.1806, + "step": 7743 + }, + { + "epoch": 0.123904, + "grad_norm": 1.1015625, + "learning_rate": 8.831774193548388e-05, + "loss": 0.179, + "step": 7744 + }, + { + "epoch": 0.12392, + "grad_norm": 2.28125, + "learning_rate": 8.831612903225806e-05, + "loss": 0.2021, + "step": 7745 + }, + { + "epoch": 0.123936, + "grad_norm": 0.72265625, + "learning_rate": 8.831451612903226e-05, + "loss": 0.1675, + "step": 7746 + }, + { + "epoch": 0.123952, + "grad_norm": 0.90234375, + "learning_rate": 8.831290322580645e-05, + "loss": 0.1853, + "step": 7747 + }, + { + "epoch": 0.123968, + "grad_norm": 0.8125, + "learning_rate": 8.831129032258065e-05, + "loss": 0.1874, + "step": 7748 + }, + { + "epoch": 0.123984, + "grad_norm": 0.86328125, + "learning_rate": 8.830967741935485e-05, + "loss": 0.1465, + "step": 7749 + }, + { + "epoch": 0.124, + "grad_norm": 0.97265625, + "learning_rate": 8.830806451612905e-05, + "loss": 0.1928, + "step": 7750 + }, + { + "epoch": 0.124016, + "grad_norm": 1.234375, + "learning_rate": 8.830645161290323e-05, + "loss": 0.1771, + "step": 7751 + }, + { + "epoch": 0.124032, + "grad_norm": 0.6875, + "learning_rate": 8.830483870967743e-05, + "loss": 0.1694, + "step": 7752 + }, + { + "epoch": 0.124048, + "grad_norm": 0.83984375, + "learning_rate": 8.830322580645162e-05, + "loss": 0.1747, + "step": 7753 + }, + { + "epoch": 0.124064, + "grad_norm": 0.7109375, + "learning_rate": 8.83016129032258e-05, + "loss": 0.1949, + "step": 7754 + }, + { + "epoch": 0.12408, + "grad_norm": 0.546875, + "learning_rate": 8.83e-05, + "loss": 0.1592, + "step": 7755 + }, + { + "epoch": 0.124096, + "grad_norm": 0.6015625, + "learning_rate": 8.829838709677419e-05, + "loss": 0.17, + "step": 7756 + }, + { + "epoch": 0.124112, + "grad_norm": 0.546875, + "learning_rate": 8.829677419354839e-05, + "loss": 0.1852, + "step": 7757 + }, + { + "epoch": 0.124128, + "grad_norm": 0.6796875, + "learning_rate": 8.829516129032258e-05, + "loss": 0.1874, + "step": 7758 + }, + { + "epoch": 0.124144, + "grad_norm": 0.79296875, + "learning_rate": 8.829354838709678e-05, + "loss": 0.1756, + "step": 7759 + }, + { + "epoch": 0.12416, + "grad_norm": 0.73828125, + "learning_rate": 8.829193548387096e-05, + "loss": 0.1581, + "step": 7760 + }, + { + "epoch": 0.124176, + "grad_norm": 0.640625, + "learning_rate": 8.829032258064516e-05, + "loss": 0.1875, + "step": 7761 + }, + { + "epoch": 0.124192, + "grad_norm": 0.8125, + "learning_rate": 8.828870967741936e-05, + "loss": 0.2003, + "step": 7762 + }, + { + "epoch": 0.124208, + "grad_norm": 0.68359375, + "learning_rate": 8.828709677419356e-05, + "loss": 0.1581, + "step": 7763 + }, + { + "epoch": 0.124224, + "grad_norm": 0.87890625, + "learning_rate": 8.828548387096775e-05, + "loss": 0.1932, + "step": 7764 + }, + { + "epoch": 0.12424, + "grad_norm": 0.71484375, + "learning_rate": 8.828387096774195e-05, + "loss": 0.1796, + "step": 7765 + }, + { + "epoch": 0.124256, + "grad_norm": 1.078125, + "learning_rate": 8.828225806451613e-05, + "loss": 0.1569, + "step": 7766 + }, + { + "epoch": 0.124272, + "grad_norm": 0.61328125, + "learning_rate": 8.828064516129033e-05, + "loss": 0.1766, + "step": 7767 + }, + { + "epoch": 0.124288, + "grad_norm": 0.60546875, + "learning_rate": 8.827903225806452e-05, + "loss": 0.1507, + "step": 7768 + }, + { + "epoch": 0.124304, + "grad_norm": 0.625, + "learning_rate": 8.82774193548387e-05, + "loss": 0.1768, + "step": 7769 + }, + { + "epoch": 0.12432, + "grad_norm": 1.2890625, + "learning_rate": 8.82758064516129e-05, + "loss": 0.1988, + "step": 7770 + }, + { + "epoch": 0.124336, + "grad_norm": 0.875, + "learning_rate": 8.827419354838709e-05, + "loss": 0.2052, + "step": 7771 + }, + { + "epoch": 0.124352, + "grad_norm": 1.078125, + "learning_rate": 8.827258064516129e-05, + "loss": 0.1792, + "step": 7772 + }, + { + "epoch": 0.124368, + "grad_norm": 0.71484375, + "learning_rate": 8.827096774193549e-05, + "loss": 0.2047, + "step": 7773 + }, + { + "epoch": 0.124384, + "grad_norm": 0.83984375, + "learning_rate": 8.826935483870969e-05, + "loss": 0.1655, + "step": 7774 + }, + { + "epoch": 0.1244, + "grad_norm": 0.84375, + "learning_rate": 8.826774193548388e-05, + "loss": 0.1603, + "step": 7775 + }, + { + "epoch": 0.124416, + "grad_norm": 0.59765625, + "learning_rate": 8.826612903225808e-05, + "loss": 0.1564, + "step": 7776 + }, + { + "epoch": 0.124432, + "grad_norm": 0.81640625, + "learning_rate": 8.826451612903226e-05, + "loss": 0.1859, + "step": 7777 + }, + { + "epoch": 0.124448, + "grad_norm": 0.8984375, + "learning_rate": 8.826290322580646e-05, + "loss": 0.1595, + "step": 7778 + }, + { + "epoch": 0.124464, + "grad_norm": 0.703125, + "learning_rate": 8.826129032258065e-05, + "loss": 0.1862, + "step": 7779 + }, + { + "epoch": 0.12448, + "grad_norm": 0.90625, + "learning_rate": 8.825967741935485e-05, + "loss": 0.1615, + "step": 7780 + }, + { + "epoch": 0.124496, + "grad_norm": 0.515625, + "learning_rate": 8.825806451612903e-05, + "loss": 0.1543, + "step": 7781 + }, + { + "epoch": 0.124512, + "grad_norm": 0.6015625, + "learning_rate": 8.825645161290323e-05, + "loss": 0.1975, + "step": 7782 + }, + { + "epoch": 0.124528, + "grad_norm": 0.81640625, + "learning_rate": 8.825483870967742e-05, + "loss": 0.1688, + "step": 7783 + }, + { + "epoch": 0.124544, + "grad_norm": 0.65234375, + "learning_rate": 8.825322580645162e-05, + "loss": 0.1613, + "step": 7784 + }, + { + "epoch": 0.12456, + "grad_norm": 0.796875, + "learning_rate": 8.825161290322582e-05, + "loss": 0.1424, + "step": 7785 + }, + { + "epoch": 0.124576, + "grad_norm": 0.7109375, + "learning_rate": 8.825e-05, + "loss": 0.1888, + "step": 7786 + }, + { + "epoch": 0.124592, + "grad_norm": 0.9921875, + "learning_rate": 8.82483870967742e-05, + "loss": 0.2026, + "step": 7787 + }, + { + "epoch": 0.124608, + "grad_norm": 0.75390625, + "learning_rate": 8.824677419354839e-05, + "loss": 0.1546, + "step": 7788 + }, + { + "epoch": 0.124624, + "grad_norm": 0.86328125, + "learning_rate": 8.824516129032259e-05, + "loss": 0.1847, + "step": 7789 + }, + { + "epoch": 0.12464, + "grad_norm": 0.734375, + "learning_rate": 8.824354838709678e-05, + "loss": 0.2003, + "step": 7790 + }, + { + "epoch": 0.124656, + "grad_norm": 0.6171875, + "learning_rate": 8.824193548387097e-05, + "loss": 0.1685, + "step": 7791 + }, + { + "epoch": 0.124672, + "grad_norm": 0.7890625, + "learning_rate": 8.824032258064516e-05, + "loss": 0.1468, + "step": 7792 + }, + { + "epoch": 0.124688, + "grad_norm": 0.85546875, + "learning_rate": 8.823870967741936e-05, + "loss": 0.204, + "step": 7793 + }, + { + "epoch": 0.124704, + "grad_norm": 0.83203125, + "learning_rate": 8.823709677419355e-05, + "loss": 0.1822, + "step": 7794 + }, + { + "epoch": 0.12472, + "grad_norm": 0.65625, + "learning_rate": 8.823548387096775e-05, + "loss": 0.2051, + "step": 7795 + }, + { + "epoch": 0.124736, + "grad_norm": 0.765625, + "learning_rate": 8.823387096774193e-05, + "loss": 0.1667, + "step": 7796 + }, + { + "epoch": 0.124752, + "grad_norm": 0.70703125, + "learning_rate": 8.823225806451613e-05, + "loss": 0.1919, + "step": 7797 + }, + { + "epoch": 0.124768, + "grad_norm": 0.609375, + "learning_rate": 8.823064516129033e-05, + "loss": 0.2116, + "step": 7798 + }, + { + "epoch": 0.124784, + "grad_norm": 1.203125, + "learning_rate": 8.822903225806453e-05, + "loss": 0.1559, + "step": 7799 + }, + { + "epoch": 0.1248, + "grad_norm": 1.265625, + "learning_rate": 8.822741935483872e-05, + "loss": 0.1997, + "step": 7800 + }, + { + "epoch": 0.124816, + "grad_norm": 0.62890625, + "learning_rate": 8.82258064516129e-05, + "loss": 0.1765, + "step": 7801 + }, + { + "epoch": 0.124832, + "grad_norm": 1.015625, + "learning_rate": 8.82241935483871e-05, + "loss": 0.2147, + "step": 7802 + }, + { + "epoch": 0.124848, + "grad_norm": 0.6953125, + "learning_rate": 8.822258064516129e-05, + "loss": 0.2003, + "step": 7803 + }, + { + "epoch": 0.124864, + "grad_norm": 0.9296875, + "learning_rate": 8.822096774193549e-05, + "loss": 0.191, + "step": 7804 + }, + { + "epoch": 0.12488, + "grad_norm": 0.7578125, + "learning_rate": 8.821935483870967e-05, + "loss": 0.1907, + "step": 7805 + }, + { + "epoch": 0.124896, + "grad_norm": 0.6171875, + "learning_rate": 8.821774193548387e-05, + "loss": 0.1289, + "step": 7806 + }, + { + "epoch": 0.124912, + "grad_norm": 0.8671875, + "learning_rate": 8.821612903225806e-05, + "loss": 0.1807, + "step": 7807 + }, + { + "epoch": 0.124928, + "grad_norm": 0.609375, + "learning_rate": 8.821451612903226e-05, + "loss": 0.1431, + "step": 7808 + }, + { + "epoch": 0.124944, + "grad_norm": 1.5078125, + "learning_rate": 8.821290322580646e-05, + "loss": 0.2092, + "step": 7809 + }, + { + "epoch": 0.12496, + "grad_norm": 0.59765625, + "learning_rate": 8.821129032258066e-05, + "loss": 0.1395, + "step": 7810 + }, + { + "epoch": 0.124976, + "grad_norm": 0.81640625, + "learning_rate": 8.820967741935485e-05, + "loss": 0.1677, + "step": 7811 + }, + { + "epoch": 0.124992, + "grad_norm": 0.62890625, + "learning_rate": 8.820806451612905e-05, + "loss": 0.2011, + "step": 7812 + }, + { + "epoch": 0.125008, + "grad_norm": 1.0859375, + "learning_rate": 8.820645161290323e-05, + "loss": 0.156, + "step": 7813 + }, + { + "epoch": 0.125024, + "grad_norm": 1.3046875, + "learning_rate": 8.820483870967743e-05, + "loss": 0.1995, + "step": 7814 + }, + { + "epoch": 0.12504, + "grad_norm": 0.76171875, + "learning_rate": 8.820322580645162e-05, + "loss": 0.1697, + "step": 7815 + }, + { + "epoch": 0.125056, + "grad_norm": 0.75390625, + "learning_rate": 8.82016129032258e-05, + "loss": 0.146, + "step": 7816 + }, + { + "epoch": 0.125072, + "grad_norm": 1.3359375, + "learning_rate": 8.82e-05, + "loss": 0.2289, + "step": 7817 + }, + { + "epoch": 0.125088, + "grad_norm": 0.67578125, + "learning_rate": 8.819838709677419e-05, + "loss": 0.1764, + "step": 7818 + }, + { + "epoch": 0.125104, + "grad_norm": 0.81640625, + "learning_rate": 8.819677419354839e-05, + "loss": 0.1673, + "step": 7819 + }, + { + "epoch": 0.12512, + "grad_norm": 0.66796875, + "learning_rate": 8.819516129032257e-05, + "loss": 0.1522, + "step": 7820 + }, + { + "epoch": 0.125136, + "grad_norm": 0.67578125, + "learning_rate": 8.819354838709677e-05, + "loss": 0.1635, + "step": 7821 + }, + { + "epoch": 0.125152, + "grad_norm": 0.61328125, + "learning_rate": 8.819193548387097e-05, + "loss": 0.144, + "step": 7822 + }, + { + "epoch": 0.125168, + "grad_norm": 1.2265625, + "learning_rate": 8.819032258064517e-05, + "loss": 0.1912, + "step": 7823 + }, + { + "epoch": 0.125184, + "grad_norm": 0.75390625, + "learning_rate": 8.818870967741936e-05, + "loss": 0.1593, + "step": 7824 + }, + { + "epoch": 0.1252, + "grad_norm": 1.1640625, + "learning_rate": 8.818709677419356e-05, + "loss": 0.2176, + "step": 7825 + }, + { + "epoch": 0.125216, + "grad_norm": 0.75390625, + "learning_rate": 8.818548387096775e-05, + "loss": 0.1833, + "step": 7826 + }, + { + "epoch": 0.125232, + "grad_norm": 0.625, + "learning_rate": 8.818387096774194e-05, + "loss": 0.1663, + "step": 7827 + }, + { + "epoch": 0.125248, + "grad_norm": 0.55859375, + "learning_rate": 8.818225806451613e-05, + "loss": 0.0971, + "step": 7828 + }, + { + "epoch": 0.125264, + "grad_norm": 0.765625, + "learning_rate": 8.818064516129033e-05, + "loss": 0.1641, + "step": 7829 + }, + { + "epoch": 0.12528, + "grad_norm": 0.62109375, + "learning_rate": 8.817903225806452e-05, + "loss": 0.1659, + "step": 7830 + }, + { + "epoch": 0.125296, + "grad_norm": 0.59375, + "learning_rate": 8.81774193548387e-05, + "loss": 0.1413, + "step": 7831 + }, + { + "epoch": 0.125312, + "grad_norm": 0.86328125, + "learning_rate": 8.81758064516129e-05, + "loss": 0.1716, + "step": 7832 + }, + { + "epoch": 0.125328, + "grad_norm": 0.6484375, + "learning_rate": 8.81741935483871e-05, + "loss": 0.1858, + "step": 7833 + }, + { + "epoch": 0.125344, + "grad_norm": 0.4921875, + "learning_rate": 8.81725806451613e-05, + "loss": 0.14, + "step": 7834 + }, + { + "epoch": 0.12536, + "grad_norm": 1.1171875, + "learning_rate": 8.817096774193549e-05, + "loss": 0.1633, + "step": 7835 + }, + { + "epoch": 0.125376, + "grad_norm": 0.73046875, + "learning_rate": 8.816935483870969e-05, + "loss": 0.175, + "step": 7836 + }, + { + "epoch": 0.125392, + "grad_norm": 0.73828125, + "learning_rate": 8.816774193548387e-05, + "loss": 0.1863, + "step": 7837 + }, + { + "epoch": 0.125408, + "grad_norm": 0.8046875, + "learning_rate": 8.816612903225807e-05, + "loss": 0.1518, + "step": 7838 + }, + { + "epoch": 0.125424, + "grad_norm": 0.86328125, + "learning_rate": 8.816451612903226e-05, + "loss": 0.1639, + "step": 7839 + }, + { + "epoch": 0.12544, + "grad_norm": 0.73046875, + "learning_rate": 8.816290322580646e-05, + "loss": 0.1887, + "step": 7840 + }, + { + "epoch": 0.125456, + "grad_norm": 0.73828125, + "learning_rate": 8.816129032258064e-05, + "loss": 0.2165, + "step": 7841 + }, + { + "epoch": 0.125472, + "grad_norm": 0.7421875, + "learning_rate": 8.815967741935484e-05, + "loss": 0.1672, + "step": 7842 + }, + { + "epoch": 0.125488, + "grad_norm": 0.80859375, + "learning_rate": 8.815806451612903e-05, + "loss": 0.1927, + "step": 7843 + }, + { + "epoch": 0.125504, + "grad_norm": 0.62890625, + "learning_rate": 8.815645161290323e-05, + "loss": 0.1809, + "step": 7844 + }, + { + "epoch": 0.12552, + "grad_norm": 0.640625, + "learning_rate": 8.815483870967743e-05, + "loss": 0.134, + "step": 7845 + }, + { + "epoch": 0.125536, + "grad_norm": 0.76171875, + "learning_rate": 8.815322580645163e-05, + "loss": 0.175, + "step": 7846 + }, + { + "epoch": 0.125552, + "grad_norm": 0.89453125, + "learning_rate": 8.815161290322582e-05, + "loss": 0.1432, + "step": 7847 + }, + { + "epoch": 0.125568, + "grad_norm": 0.8515625, + "learning_rate": 8.815e-05, + "loss": 0.1788, + "step": 7848 + }, + { + "epoch": 0.125584, + "grad_norm": 0.80078125, + "learning_rate": 8.81483870967742e-05, + "loss": 0.1713, + "step": 7849 + }, + { + "epoch": 0.1256, + "grad_norm": 1.265625, + "learning_rate": 8.814677419354839e-05, + "loss": 0.1945, + "step": 7850 + }, + { + "epoch": 0.125616, + "grad_norm": 0.9140625, + "learning_rate": 8.814516129032259e-05, + "loss": 0.2105, + "step": 7851 + }, + { + "epoch": 0.125632, + "grad_norm": 0.66015625, + "learning_rate": 8.814354838709677e-05, + "loss": 0.1858, + "step": 7852 + }, + { + "epoch": 0.125648, + "grad_norm": 0.8046875, + "learning_rate": 8.814193548387097e-05, + "loss": 0.2026, + "step": 7853 + }, + { + "epoch": 0.125664, + "grad_norm": 0.69140625, + "learning_rate": 8.814032258064516e-05, + "loss": 0.17, + "step": 7854 + }, + { + "epoch": 0.12568, + "grad_norm": 0.80078125, + "learning_rate": 8.813870967741936e-05, + "loss": 0.2198, + "step": 7855 + }, + { + "epoch": 0.125696, + "grad_norm": 0.89453125, + "learning_rate": 8.813709677419354e-05, + "loss": 0.1863, + "step": 7856 + }, + { + "epoch": 0.125712, + "grad_norm": 1.375, + "learning_rate": 8.813548387096774e-05, + "loss": 0.1618, + "step": 7857 + }, + { + "epoch": 0.125728, + "grad_norm": 0.80078125, + "learning_rate": 8.813387096774194e-05, + "loss": 0.1722, + "step": 7858 + }, + { + "epoch": 0.125744, + "grad_norm": 0.51953125, + "learning_rate": 8.813225806451614e-05, + "loss": 0.1538, + "step": 7859 + }, + { + "epoch": 0.12576, + "grad_norm": 0.98046875, + "learning_rate": 8.813064516129033e-05, + "loss": 0.1768, + "step": 7860 + }, + { + "epoch": 0.125776, + "grad_norm": 1.2265625, + "learning_rate": 8.812903225806453e-05, + "loss": 0.1693, + "step": 7861 + }, + { + "epoch": 0.125792, + "grad_norm": 0.5390625, + "learning_rate": 8.812741935483871e-05, + "loss": 0.1733, + "step": 7862 + }, + { + "epoch": 0.125808, + "grad_norm": 0.68359375, + "learning_rate": 8.81258064516129e-05, + "loss": 0.1742, + "step": 7863 + }, + { + "epoch": 0.125824, + "grad_norm": 0.62890625, + "learning_rate": 8.81241935483871e-05, + "loss": 0.1739, + "step": 7864 + }, + { + "epoch": 0.12584, + "grad_norm": 0.82421875, + "learning_rate": 8.812258064516129e-05, + "loss": 0.1926, + "step": 7865 + }, + { + "epoch": 0.125856, + "grad_norm": 0.65234375, + "learning_rate": 8.812096774193549e-05, + "loss": 0.1647, + "step": 7866 + }, + { + "epoch": 0.125872, + "grad_norm": 0.7578125, + "learning_rate": 8.811935483870967e-05, + "loss": 0.2094, + "step": 7867 + }, + { + "epoch": 0.125888, + "grad_norm": 0.455078125, + "learning_rate": 8.811774193548387e-05, + "loss": 0.1454, + "step": 7868 + }, + { + "epoch": 0.125904, + "grad_norm": 0.8515625, + "learning_rate": 8.811612903225807e-05, + "loss": 0.1907, + "step": 7869 + }, + { + "epoch": 0.12592, + "grad_norm": 1.015625, + "learning_rate": 8.811451612903227e-05, + "loss": 0.1722, + "step": 7870 + }, + { + "epoch": 0.125936, + "grad_norm": 0.87109375, + "learning_rate": 8.811290322580646e-05, + "loss": 0.2148, + "step": 7871 + }, + { + "epoch": 0.125952, + "grad_norm": 0.6328125, + "learning_rate": 8.811129032258066e-05, + "loss": 0.1841, + "step": 7872 + }, + { + "epoch": 0.125968, + "grad_norm": 1.0703125, + "learning_rate": 8.810967741935484e-05, + "loss": 0.1859, + "step": 7873 + }, + { + "epoch": 0.125984, + "grad_norm": 0.84765625, + "learning_rate": 8.810806451612904e-05, + "loss": 0.1685, + "step": 7874 + }, + { + "epoch": 0.126, + "grad_norm": 0.58203125, + "learning_rate": 8.810645161290323e-05, + "loss": 0.1762, + "step": 7875 + }, + { + "epoch": 0.126016, + "grad_norm": 0.77734375, + "learning_rate": 8.810483870967743e-05, + "loss": 0.1677, + "step": 7876 + }, + { + "epoch": 0.126032, + "grad_norm": 0.50390625, + "learning_rate": 8.810322580645161e-05, + "loss": 0.149, + "step": 7877 + }, + { + "epoch": 0.126048, + "grad_norm": 0.75390625, + "learning_rate": 8.81016129032258e-05, + "loss": 0.1874, + "step": 7878 + }, + { + "epoch": 0.126064, + "grad_norm": 0.9921875, + "learning_rate": 8.81e-05, + "loss": 0.1969, + "step": 7879 + }, + { + "epoch": 0.12608, + "grad_norm": 0.76171875, + "learning_rate": 8.80983870967742e-05, + "loss": 0.1683, + "step": 7880 + }, + { + "epoch": 0.126096, + "grad_norm": 1.4140625, + "learning_rate": 8.809677419354839e-05, + "loss": 0.1662, + "step": 7881 + }, + { + "epoch": 0.126112, + "grad_norm": 0.83984375, + "learning_rate": 8.809516129032259e-05, + "loss": 0.1819, + "step": 7882 + }, + { + "epoch": 0.126128, + "grad_norm": 0.82421875, + "learning_rate": 8.809354838709679e-05, + "loss": 0.1488, + "step": 7883 + }, + { + "epoch": 0.126144, + "grad_norm": 0.84375, + "learning_rate": 8.809193548387097e-05, + "loss": 0.1696, + "step": 7884 + }, + { + "epoch": 0.12616, + "grad_norm": 0.9609375, + "learning_rate": 8.809032258064517e-05, + "loss": 0.1817, + "step": 7885 + }, + { + "epoch": 0.126176, + "grad_norm": 0.625, + "learning_rate": 8.808870967741936e-05, + "loss": 0.1568, + "step": 7886 + }, + { + "epoch": 0.126192, + "grad_norm": 0.6796875, + "learning_rate": 8.808709677419356e-05, + "loss": 0.2174, + "step": 7887 + }, + { + "epoch": 0.126208, + "grad_norm": 0.6640625, + "learning_rate": 8.808548387096774e-05, + "loss": 0.1638, + "step": 7888 + }, + { + "epoch": 0.126224, + "grad_norm": 0.796875, + "learning_rate": 8.808387096774194e-05, + "loss": 0.1956, + "step": 7889 + }, + { + "epoch": 0.12624, + "grad_norm": 0.88671875, + "learning_rate": 8.808225806451613e-05, + "loss": 0.1947, + "step": 7890 + }, + { + "epoch": 0.126256, + "grad_norm": 0.546875, + "learning_rate": 8.808064516129033e-05, + "loss": 0.1781, + "step": 7891 + }, + { + "epoch": 0.126272, + "grad_norm": 1.09375, + "learning_rate": 8.807903225806451e-05, + "loss": 0.1577, + "step": 7892 + }, + { + "epoch": 0.126288, + "grad_norm": 0.54296875, + "learning_rate": 8.807741935483871e-05, + "loss": 0.138, + "step": 7893 + }, + { + "epoch": 0.126304, + "grad_norm": 0.84765625, + "learning_rate": 8.807580645161291e-05, + "loss": 0.1933, + "step": 7894 + }, + { + "epoch": 0.12632, + "grad_norm": 0.85546875, + "learning_rate": 8.80741935483871e-05, + "loss": 0.1961, + "step": 7895 + }, + { + "epoch": 0.126336, + "grad_norm": 0.62890625, + "learning_rate": 8.80725806451613e-05, + "loss": 0.1592, + "step": 7896 + }, + { + "epoch": 0.126352, + "grad_norm": 1.0390625, + "learning_rate": 8.807096774193549e-05, + "loss": 0.2101, + "step": 7897 + }, + { + "epoch": 0.126368, + "grad_norm": 0.68359375, + "learning_rate": 8.806935483870968e-05, + "loss": 0.1686, + "step": 7898 + }, + { + "epoch": 0.126384, + "grad_norm": 0.640625, + "learning_rate": 8.806774193548387e-05, + "loss": 0.1745, + "step": 7899 + }, + { + "epoch": 0.1264, + "grad_norm": 0.6640625, + "learning_rate": 8.806612903225807e-05, + "loss": 0.1578, + "step": 7900 + }, + { + "epoch": 0.126416, + "grad_norm": 1.0078125, + "learning_rate": 8.806451612903226e-05, + "loss": 0.149, + "step": 7901 + }, + { + "epoch": 0.126432, + "grad_norm": 1.0, + "learning_rate": 8.806290322580646e-05, + "loss": 0.1902, + "step": 7902 + }, + { + "epoch": 0.126448, + "grad_norm": 1.1484375, + "learning_rate": 8.806129032258064e-05, + "loss": 0.2033, + "step": 7903 + }, + { + "epoch": 0.126464, + "grad_norm": 0.7578125, + "learning_rate": 8.805967741935484e-05, + "loss": 0.1512, + "step": 7904 + }, + { + "epoch": 0.12648, + "grad_norm": 0.73828125, + "learning_rate": 8.805806451612904e-05, + "loss": 0.1796, + "step": 7905 + }, + { + "epoch": 0.126496, + "grad_norm": 1.046875, + "learning_rate": 8.805645161290324e-05, + "loss": 0.1841, + "step": 7906 + }, + { + "epoch": 0.126512, + "grad_norm": 0.53515625, + "learning_rate": 8.805483870967743e-05, + "loss": 0.1197, + "step": 7907 + }, + { + "epoch": 0.126528, + "grad_norm": 1.078125, + "learning_rate": 8.805322580645163e-05, + "loss": 0.1772, + "step": 7908 + }, + { + "epoch": 0.126544, + "grad_norm": 0.66015625, + "learning_rate": 8.805161290322581e-05, + "loss": 0.1773, + "step": 7909 + }, + { + "epoch": 0.12656, + "grad_norm": 1.1484375, + "learning_rate": 8.805e-05, + "loss": 0.2046, + "step": 7910 + }, + { + "epoch": 0.126576, + "grad_norm": 0.8515625, + "learning_rate": 8.80483870967742e-05, + "loss": 0.2008, + "step": 7911 + }, + { + "epoch": 0.126592, + "grad_norm": 0.75390625, + "learning_rate": 8.804677419354838e-05, + "loss": 0.1708, + "step": 7912 + }, + { + "epoch": 0.126608, + "grad_norm": 0.9921875, + "learning_rate": 8.804516129032258e-05, + "loss": 0.1922, + "step": 7913 + }, + { + "epoch": 0.126624, + "grad_norm": 1.1796875, + "learning_rate": 8.804354838709677e-05, + "loss": 0.1375, + "step": 7914 + }, + { + "epoch": 0.12664, + "grad_norm": 1.2578125, + "learning_rate": 8.804193548387097e-05, + "loss": 0.1652, + "step": 7915 + }, + { + "epoch": 0.126656, + "grad_norm": 1.5703125, + "learning_rate": 8.804032258064516e-05, + "loss": 0.1751, + "step": 7916 + }, + { + "epoch": 0.126672, + "grad_norm": 0.69140625, + "learning_rate": 8.803870967741936e-05, + "loss": 0.1634, + "step": 7917 + }, + { + "epoch": 0.126688, + "grad_norm": 1.0703125, + "learning_rate": 8.803709677419356e-05, + "loss": 0.1612, + "step": 7918 + }, + { + "epoch": 0.126704, + "grad_norm": 1.3828125, + "learning_rate": 8.803548387096775e-05, + "loss": 0.1723, + "step": 7919 + }, + { + "epoch": 0.12672, + "grad_norm": 0.796875, + "learning_rate": 8.803387096774194e-05, + "loss": 0.1702, + "step": 7920 + }, + { + "epoch": 0.126736, + "grad_norm": 0.96875, + "learning_rate": 8.803225806451614e-05, + "loss": 0.1537, + "step": 7921 + }, + { + "epoch": 0.126752, + "grad_norm": 0.96875, + "learning_rate": 8.803064516129033e-05, + "loss": 0.1355, + "step": 7922 + }, + { + "epoch": 0.126768, + "grad_norm": 1.1953125, + "learning_rate": 8.802903225806453e-05, + "loss": 0.1619, + "step": 7923 + }, + { + "epoch": 0.126784, + "grad_norm": 0.625, + "learning_rate": 8.802741935483871e-05, + "loss": 0.1628, + "step": 7924 + }, + { + "epoch": 0.1268, + "grad_norm": 1.125, + "learning_rate": 8.80258064516129e-05, + "loss": 0.1775, + "step": 7925 + }, + { + "epoch": 0.126816, + "grad_norm": 0.87890625, + "learning_rate": 8.80241935483871e-05, + "loss": 0.1413, + "step": 7926 + }, + { + "epoch": 0.126832, + "grad_norm": 0.69140625, + "learning_rate": 8.802258064516128e-05, + "loss": 0.2017, + "step": 7927 + }, + { + "epoch": 0.126848, + "grad_norm": 0.92578125, + "learning_rate": 8.802096774193548e-05, + "loss": 0.1523, + "step": 7928 + }, + { + "epoch": 0.126864, + "grad_norm": 1.15625, + "learning_rate": 8.801935483870968e-05, + "loss": 0.2202, + "step": 7929 + }, + { + "epoch": 0.12688, + "grad_norm": 0.96484375, + "learning_rate": 8.801774193548388e-05, + "loss": 0.1899, + "step": 7930 + }, + { + "epoch": 0.126896, + "grad_norm": 0.7890625, + "learning_rate": 8.801612903225807e-05, + "loss": 0.1835, + "step": 7931 + }, + { + "epoch": 0.126912, + "grad_norm": 0.80078125, + "learning_rate": 8.801451612903227e-05, + "loss": 0.1542, + "step": 7932 + }, + { + "epoch": 0.126928, + "grad_norm": 0.7421875, + "learning_rate": 8.801290322580645e-05, + "loss": 0.146, + "step": 7933 + }, + { + "epoch": 0.126944, + "grad_norm": 0.83203125, + "learning_rate": 8.801129032258065e-05, + "loss": 0.2057, + "step": 7934 + }, + { + "epoch": 0.12696, + "grad_norm": 0.6640625, + "learning_rate": 8.800967741935484e-05, + "loss": 0.1943, + "step": 7935 + }, + { + "epoch": 0.126976, + "grad_norm": 1.1484375, + "learning_rate": 8.800806451612904e-05, + "loss": 0.1943, + "step": 7936 + }, + { + "epoch": 0.126992, + "grad_norm": 0.7265625, + "learning_rate": 8.800645161290323e-05, + "loss": 0.1825, + "step": 7937 + }, + { + "epoch": 0.127008, + "grad_norm": 0.51171875, + "learning_rate": 8.800483870967743e-05, + "loss": 0.181, + "step": 7938 + }, + { + "epoch": 0.127024, + "grad_norm": 0.7734375, + "learning_rate": 8.800322580645161e-05, + "loss": 0.184, + "step": 7939 + }, + { + "epoch": 0.12704, + "grad_norm": 0.6640625, + "learning_rate": 8.800161290322581e-05, + "loss": 0.1494, + "step": 7940 + }, + { + "epoch": 0.127056, + "grad_norm": 0.58203125, + "learning_rate": 8.800000000000001e-05, + "loss": 0.1719, + "step": 7941 + }, + { + "epoch": 0.127072, + "grad_norm": 0.51171875, + "learning_rate": 8.79983870967742e-05, + "loss": 0.1447, + "step": 7942 + }, + { + "epoch": 0.127088, + "grad_norm": 0.5703125, + "learning_rate": 8.79967741935484e-05, + "loss": 0.1506, + "step": 7943 + }, + { + "epoch": 0.127104, + "grad_norm": 0.65234375, + "learning_rate": 8.799516129032258e-05, + "loss": 0.1515, + "step": 7944 + }, + { + "epoch": 0.12712, + "grad_norm": 0.96484375, + "learning_rate": 8.799354838709678e-05, + "loss": 0.1731, + "step": 7945 + }, + { + "epoch": 0.127136, + "grad_norm": 0.953125, + "learning_rate": 8.799193548387097e-05, + "loss": 0.1857, + "step": 7946 + }, + { + "epoch": 0.127152, + "grad_norm": 0.984375, + "learning_rate": 8.799032258064517e-05, + "loss": 0.2238, + "step": 7947 + }, + { + "epoch": 0.127168, + "grad_norm": 1.3046875, + "learning_rate": 8.798870967741935e-05, + "loss": 0.2362, + "step": 7948 + }, + { + "epoch": 0.127184, + "grad_norm": 1.15625, + "learning_rate": 8.798709677419355e-05, + "loss": 0.1705, + "step": 7949 + }, + { + "epoch": 0.1272, + "grad_norm": 0.43359375, + "learning_rate": 8.798548387096774e-05, + "loss": 0.1501, + "step": 7950 + }, + { + "epoch": 0.127216, + "grad_norm": 0.765625, + "learning_rate": 8.798387096774194e-05, + "loss": 0.1344, + "step": 7951 + }, + { + "epoch": 0.127232, + "grad_norm": 1.2265625, + "learning_rate": 8.798225806451613e-05, + "loss": 0.182, + "step": 7952 + }, + { + "epoch": 0.127248, + "grad_norm": 0.875, + "learning_rate": 8.798064516129033e-05, + "loss": 0.1564, + "step": 7953 + }, + { + "epoch": 0.127264, + "grad_norm": 1.15625, + "learning_rate": 8.797903225806453e-05, + "loss": 0.1654, + "step": 7954 + }, + { + "epoch": 0.12728, + "grad_norm": 1.1796875, + "learning_rate": 8.797741935483872e-05, + "loss": 0.1656, + "step": 7955 + }, + { + "epoch": 0.127296, + "grad_norm": 0.73828125, + "learning_rate": 8.797580645161291e-05, + "loss": 0.1889, + "step": 7956 + }, + { + "epoch": 0.127312, + "grad_norm": 0.87890625, + "learning_rate": 8.79741935483871e-05, + "loss": 0.1878, + "step": 7957 + }, + { + "epoch": 0.127328, + "grad_norm": 0.6796875, + "learning_rate": 8.79725806451613e-05, + "loss": 0.1794, + "step": 7958 + }, + { + "epoch": 0.127344, + "grad_norm": 0.94921875, + "learning_rate": 8.797096774193548e-05, + "loss": 0.1808, + "step": 7959 + }, + { + "epoch": 0.12736, + "grad_norm": 0.89453125, + "learning_rate": 8.796935483870968e-05, + "loss": 0.1854, + "step": 7960 + }, + { + "epoch": 0.127376, + "grad_norm": 0.890625, + "learning_rate": 8.796774193548387e-05, + "loss": 0.1534, + "step": 7961 + }, + { + "epoch": 0.127392, + "grad_norm": 0.7265625, + "learning_rate": 8.796612903225807e-05, + "loss": 0.2009, + "step": 7962 + }, + { + "epoch": 0.127408, + "grad_norm": 1.140625, + "learning_rate": 8.796451612903225e-05, + "loss": 0.1589, + "step": 7963 + }, + { + "epoch": 0.127424, + "grad_norm": 1.2421875, + "learning_rate": 8.796290322580645e-05, + "loss": 0.1752, + "step": 7964 + }, + { + "epoch": 0.12744, + "grad_norm": 0.734375, + "learning_rate": 8.796129032258065e-05, + "loss": 0.1557, + "step": 7965 + }, + { + "epoch": 0.127456, + "grad_norm": 0.90234375, + "learning_rate": 8.795967741935485e-05, + "loss": 0.1709, + "step": 7966 + }, + { + "epoch": 0.127472, + "grad_norm": 0.8203125, + "learning_rate": 8.795806451612904e-05, + "loss": 0.1816, + "step": 7967 + }, + { + "epoch": 0.127488, + "grad_norm": 0.9375, + "learning_rate": 8.795645161290324e-05, + "loss": 0.1943, + "step": 7968 + }, + { + "epoch": 0.127504, + "grad_norm": 0.70703125, + "learning_rate": 8.795483870967742e-05, + "loss": 0.1599, + "step": 7969 + }, + { + "epoch": 0.12752, + "grad_norm": 0.59375, + "learning_rate": 8.795322580645162e-05, + "loss": 0.1272, + "step": 7970 + }, + { + "epoch": 0.127536, + "grad_norm": 1.46875, + "learning_rate": 8.795161290322581e-05, + "loss": 0.193, + "step": 7971 + }, + { + "epoch": 0.127552, + "grad_norm": 0.55859375, + "learning_rate": 8.795e-05, + "loss": 0.1386, + "step": 7972 + }, + { + "epoch": 0.127568, + "grad_norm": 0.77734375, + "learning_rate": 8.79483870967742e-05, + "loss": 0.1893, + "step": 7973 + }, + { + "epoch": 0.127584, + "grad_norm": 0.7421875, + "learning_rate": 8.794677419354838e-05, + "loss": 0.1931, + "step": 7974 + }, + { + "epoch": 0.1276, + "grad_norm": 0.64453125, + "learning_rate": 8.794516129032258e-05, + "loss": 0.1722, + "step": 7975 + }, + { + "epoch": 0.127616, + "grad_norm": 0.8828125, + "learning_rate": 8.794354838709677e-05, + "loss": 0.2186, + "step": 7976 + }, + { + "epoch": 0.127632, + "grad_norm": 0.81640625, + "learning_rate": 8.794193548387097e-05, + "loss": 0.1582, + "step": 7977 + }, + { + "epoch": 0.127648, + "grad_norm": 0.59765625, + "learning_rate": 8.794032258064517e-05, + "loss": 0.1628, + "step": 7978 + }, + { + "epoch": 0.127664, + "grad_norm": 0.5390625, + "learning_rate": 8.793870967741937e-05, + "loss": 0.1406, + "step": 7979 + }, + { + "epoch": 0.12768, + "grad_norm": 0.55078125, + "learning_rate": 8.793709677419355e-05, + "loss": 0.196, + "step": 7980 + }, + { + "epoch": 0.127696, + "grad_norm": 1.109375, + "learning_rate": 8.793548387096775e-05, + "loss": 0.2088, + "step": 7981 + }, + { + "epoch": 0.127712, + "grad_norm": 0.99609375, + "learning_rate": 8.793387096774194e-05, + "loss": 0.2178, + "step": 7982 + }, + { + "epoch": 0.127728, + "grad_norm": 0.9765625, + "learning_rate": 8.793225806451614e-05, + "loss": 0.1903, + "step": 7983 + }, + { + "epoch": 0.127744, + "grad_norm": 0.8359375, + "learning_rate": 8.793064516129032e-05, + "loss": 0.235, + "step": 7984 + }, + { + "epoch": 0.12776, + "grad_norm": 0.9453125, + "learning_rate": 8.792903225806452e-05, + "loss": 0.1725, + "step": 7985 + }, + { + "epoch": 0.127776, + "grad_norm": 1.171875, + "learning_rate": 8.792741935483871e-05, + "loss": 0.2348, + "step": 7986 + }, + { + "epoch": 0.127792, + "grad_norm": 1.46875, + "learning_rate": 8.79258064516129e-05, + "loss": 0.1729, + "step": 7987 + }, + { + "epoch": 0.127808, + "grad_norm": 0.58203125, + "learning_rate": 8.79241935483871e-05, + "loss": 0.1496, + "step": 7988 + }, + { + "epoch": 0.127824, + "grad_norm": 0.921875, + "learning_rate": 8.79225806451613e-05, + "loss": 0.1995, + "step": 7989 + }, + { + "epoch": 0.12784, + "grad_norm": 0.80859375, + "learning_rate": 8.79209677419355e-05, + "loss": 0.154, + "step": 7990 + }, + { + "epoch": 0.127856, + "grad_norm": 0.6875, + "learning_rate": 8.791935483870968e-05, + "loss": 0.17, + "step": 7991 + }, + { + "epoch": 0.127872, + "grad_norm": 0.8515625, + "learning_rate": 8.791774193548388e-05, + "loss": 0.1933, + "step": 7992 + }, + { + "epoch": 0.127888, + "grad_norm": 1.1171875, + "learning_rate": 8.791612903225807e-05, + "loss": 0.1652, + "step": 7993 + }, + { + "epoch": 0.127904, + "grad_norm": 1.4296875, + "learning_rate": 8.791451612903227e-05, + "loss": 0.1709, + "step": 7994 + }, + { + "epoch": 0.12792, + "grad_norm": 0.671875, + "learning_rate": 8.791290322580645e-05, + "loss": 0.1381, + "step": 7995 + }, + { + "epoch": 0.127936, + "grad_norm": 0.66015625, + "learning_rate": 8.791129032258065e-05, + "loss": 0.1497, + "step": 7996 + }, + { + "epoch": 0.127952, + "grad_norm": 0.96484375, + "learning_rate": 8.790967741935484e-05, + "loss": 0.1478, + "step": 7997 + }, + { + "epoch": 0.127968, + "grad_norm": 0.8671875, + "learning_rate": 8.790806451612904e-05, + "loss": 0.1802, + "step": 7998 + }, + { + "epoch": 0.127984, + "grad_norm": 0.90234375, + "learning_rate": 8.790645161290322e-05, + "loss": 0.1966, + "step": 7999 + }, + { + "epoch": 0.128, + "grad_norm": 1.5703125, + "learning_rate": 8.790483870967742e-05, + "loss": 0.1691, + "step": 8000 + }, + { + "epoch": 0.128016, + "grad_norm": 0.67578125, + "learning_rate": 8.790322580645162e-05, + "loss": 0.1321, + "step": 8001 + }, + { + "epoch": 0.128032, + "grad_norm": 0.70703125, + "learning_rate": 8.790161290322582e-05, + "loss": 0.1484, + "step": 8002 + }, + { + "epoch": 0.128048, + "grad_norm": 1.1328125, + "learning_rate": 8.790000000000001e-05, + "loss": 0.1851, + "step": 8003 + }, + { + "epoch": 0.128064, + "grad_norm": 0.796875, + "learning_rate": 8.78983870967742e-05, + "loss": 0.1817, + "step": 8004 + }, + { + "epoch": 0.12808, + "grad_norm": 0.6640625, + "learning_rate": 8.78967741935484e-05, + "loss": 0.1629, + "step": 8005 + }, + { + "epoch": 0.128096, + "grad_norm": 0.765625, + "learning_rate": 8.789516129032258e-05, + "loss": 0.2017, + "step": 8006 + }, + { + "epoch": 0.128112, + "grad_norm": 0.98828125, + "learning_rate": 8.789354838709678e-05, + "loss": 0.21, + "step": 8007 + }, + { + "epoch": 0.128128, + "grad_norm": 0.76171875, + "learning_rate": 8.789193548387097e-05, + "loss": 0.1907, + "step": 8008 + }, + { + "epoch": 0.128144, + "grad_norm": 1.421875, + "learning_rate": 8.789032258064517e-05, + "loss": 0.1981, + "step": 8009 + }, + { + "epoch": 0.12816, + "grad_norm": 0.6796875, + "learning_rate": 8.788870967741935e-05, + "loss": 0.2068, + "step": 8010 + }, + { + "epoch": 0.128176, + "grad_norm": 0.75, + "learning_rate": 8.788709677419355e-05, + "loss": 0.1727, + "step": 8011 + }, + { + "epoch": 0.128192, + "grad_norm": 0.65625, + "learning_rate": 8.788548387096774e-05, + "loss": 0.1842, + "step": 8012 + }, + { + "epoch": 0.128208, + "grad_norm": 0.81640625, + "learning_rate": 8.788387096774194e-05, + "loss": 0.1486, + "step": 8013 + }, + { + "epoch": 0.128224, + "grad_norm": 1.046875, + "learning_rate": 8.788225806451614e-05, + "loss": 0.1792, + "step": 8014 + }, + { + "epoch": 0.12824, + "grad_norm": 0.66796875, + "learning_rate": 8.788064516129034e-05, + "loss": 0.1658, + "step": 8015 + }, + { + "epoch": 0.128256, + "grad_norm": 0.546875, + "learning_rate": 8.787903225806452e-05, + "loss": 0.1525, + "step": 8016 + }, + { + "epoch": 0.128272, + "grad_norm": 0.78515625, + "learning_rate": 8.787741935483872e-05, + "loss": 0.201, + "step": 8017 + }, + { + "epoch": 0.128288, + "grad_norm": 0.859375, + "learning_rate": 8.787580645161291e-05, + "loss": 0.1691, + "step": 8018 + }, + { + "epoch": 0.128304, + "grad_norm": 0.625, + "learning_rate": 8.78741935483871e-05, + "loss": 0.1416, + "step": 8019 + }, + { + "epoch": 0.12832, + "grad_norm": 0.8203125, + "learning_rate": 8.78725806451613e-05, + "loss": 0.2343, + "step": 8020 + }, + { + "epoch": 0.128336, + "grad_norm": 1.0234375, + "learning_rate": 8.787096774193548e-05, + "loss": 0.1478, + "step": 8021 + }, + { + "epoch": 0.128352, + "grad_norm": 1.234375, + "learning_rate": 8.786935483870968e-05, + "loss": 0.1812, + "step": 8022 + }, + { + "epoch": 0.128368, + "grad_norm": 0.9375, + "learning_rate": 8.786774193548387e-05, + "loss": 0.1821, + "step": 8023 + }, + { + "epoch": 0.128384, + "grad_norm": 0.78125, + "learning_rate": 8.786612903225807e-05, + "loss": 0.1911, + "step": 8024 + }, + { + "epoch": 0.1284, + "grad_norm": 0.69921875, + "learning_rate": 8.786451612903227e-05, + "loss": 0.1698, + "step": 8025 + }, + { + "epoch": 0.128416, + "grad_norm": 1.1015625, + "learning_rate": 8.786290322580646e-05, + "loss": 0.209, + "step": 8026 + }, + { + "epoch": 0.128432, + "grad_norm": 0.77734375, + "learning_rate": 8.786129032258065e-05, + "loss": 0.171, + "step": 8027 + }, + { + "epoch": 0.128448, + "grad_norm": 0.6015625, + "learning_rate": 8.785967741935485e-05, + "loss": 0.1476, + "step": 8028 + }, + { + "epoch": 0.128464, + "grad_norm": 1.40625, + "learning_rate": 8.785806451612904e-05, + "loss": 0.1954, + "step": 8029 + }, + { + "epoch": 0.12848, + "grad_norm": 0.76171875, + "learning_rate": 8.785645161290324e-05, + "loss": 0.1965, + "step": 8030 + }, + { + "epoch": 0.128496, + "grad_norm": 0.68359375, + "learning_rate": 8.785483870967742e-05, + "loss": 0.1546, + "step": 8031 + }, + { + "epoch": 0.128512, + "grad_norm": 0.9609375, + "learning_rate": 8.785322580645162e-05, + "loss": 0.2037, + "step": 8032 + }, + { + "epoch": 0.128528, + "grad_norm": 0.9375, + "learning_rate": 8.785161290322581e-05, + "loss": 0.1454, + "step": 8033 + }, + { + "epoch": 0.128544, + "grad_norm": 1.171875, + "learning_rate": 8.785e-05, + "loss": 0.165, + "step": 8034 + }, + { + "epoch": 0.12856, + "grad_norm": 0.71875, + "learning_rate": 8.78483870967742e-05, + "loss": 0.1378, + "step": 8035 + }, + { + "epoch": 0.128576, + "grad_norm": 0.7734375, + "learning_rate": 8.78467741935484e-05, + "loss": 0.1525, + "step": 8036 + }, + { + "epoch": 0.128592, + "grad_norm": 0.58984375, + "learning_rate": 8.784516129032259e-05, + "loss": 0.1307, + "step": 8037 + }, + { + "epoch": 0.128608, + "grad_norm": 0.9140625, + "learning_rate": 8.784354838709678e-05, + "loss": 0.1621, + "step": 8038 + }, + { + "epoch": 0.128624, + "grad_norm": 0.82421875, + "learning_rate": 8.784193548387098e-05, + "loss": 0.1783, + "step": 8039 + }, + { + "epoch": 0.12864, + "grad_norm": 0.6328125, + "learning_rate": 8.784032258064516e-05, + "loss": 0.1713, + "step": 8040 + }, + { + "epoch": 0.128656, + "grad_norm": 1.1171875, + "learning_rate": 8.783870967741936e-05, + "loss": 0.1743, + "step": 8041 + }, + { + "epoch": 0.128672, + "grad_norm": 0.671875, + "learning_rate": 8.783709677419355e-05, + "loss": 0.1783, + "step": 8042 + }, + { + "epoch": 0.128688, + "grad_norm": 1.28125, + "learning_rate": 8.783548387096775e-05, + "loss": 0.2195, + "step": 8043 + }, + { + "epoch": 0.128704, + "grad_norm": 0.7734375, + "learning_rate": 8.783387096774194e-05, + "loss": 0.1425, + "step": 8044 + }, + { + "epoch": 0.12872, + "grad_norm": 1.015625, + "learning_rate": 8.783225806451614e-05, + "loss": 0.1907, + "step": 8045 + }, + { + "epoch": 0.128736, + "grad_norm": 0.796875, + "learning_rate": 8.783064516129032e-05, + "loss": 0.1722, + "step": 8046 + }, + { + "epoch": 0.128752, + "grad_norm": 0.94921875, + "learning_rate": 8.782903225806452e-05, + "loss": 0.1901, + "step": 8047 + }, + { + "epoch": 0.128768, + "grad_norm": 0.73046875, + "learning_rate": 8.782741935483871e-05, + "loss": 0.1709, + "step": 8048 + }, + { + "epoch": 0.128784, + "grad_norm": 0.8359375, + "learning_rate": 8.782580645161291e-05, + "loss": 0.1798, + "step": 8049 + }, + { + "epoch": 0.1288, + "grad_norm": 1.2578125, + "learning_rate": 8.782419354838711e-05, + "loss": 0.1887, + "step": 8050 + }, + { + "epoch": 0.128816, + "grad_norm": 1.203125, + "learning_rate": 8.782258064516129e-05, + "loss": 0.2193, + "step": 8051 + }, + { + "epoch": 0.128832, + "grad_norm": 0.515625, + "learning_rate": 8.782096774193549e-05, + "loss": 0.1407, + "step": 8052 + }, + { + "epoch": 0.128848, + "grad_norm": 0.68359375, + "learning_rate": 8.781935483870968e-05, + "loss": 0.1803, + "step": 8053 + }, + { + "epoch": 0.128864, + "grad_norm": 1.1875, + "learning_rate": 8.781774193548388e-05, + "loss": 0.1606, + "step": 8054 + }, + { + "epoch": 0.12888, + "grad_norm": 0.6328125, + "learning_rate": 8.781612903225806e-05, + "loss": 0.1904, + "step": 8055 + }, + { + "epoch": 0.128896, + "grad_norm": 0.6953125, + "learning_rate": 8.781451612903226e-05, + "loss": 0.1851, + "step": 8056 + }, + { + "epoch": 0.128912, + "grad_norm": 1.0859375, + "learning_rate": 8.781290322580645e-05, + "loss": 0.2105, + "step": 8057 + }, + { + "epoch": 0.128928, + "grad_norm": 0.59375, + "learning_rate": 8.781129032258065e-05, + "loss": 0.1649, + "step": 8058 + }, + { + "epoch": 0.128944, + "grad_norm": 0.9453125, + "learning_rate": 8.780967741935484e-05, + "loss": 0.2085, + "step": 8059 + }, + { + "epoch": 0.12896, + "grad_norm": 0.5390625, + "learning_rate": 8.780806451612904e-05, + "loss": 0.17, + "step": 8060 + }, + { + "epoch": 0.128976, + "grad_norm": 0.87890625, + "learning_rate": 8.780645161290323e-05, + "loss": 0.1899, + "step": 8061 + }, + { + "epoch": 0.128992, + "grad_norm": 0.796875, + "learning_rate": 8.780483870967743e-05, + "loss": 0.2164, + "step": 8062 + }, + { + "epoch": 0.129008, + "grad_norm": 1.1328125, + "learning_rate": 8.780322580645162e-05, + "loss": 0.2045, + "step": 8063 + }, + { + "epoch": 0.129024, + "grad_norm": 0.80078125, + "learning_rate": 8.780161290322582e-05, + "loss": 0.2167, + "step": 8064 + }, + { + "epoch": 0.12904, + "grad_norm": 0.6953125, + "learning_rate": 8.78e-05, + "loss": 0.1698, + "step": 8065 + }, + { + "epoch": 0.129056, + "grad_norm": 0.59375, + "learning_rate": 8.779838709677419e-05, + "loss": 0.1783, + "step": 8066 + }, + { + "epoch": 0.129072, + "grad_norm": 0.51953125, + "learning_rate": 8.779677419354839e-05, + "loss": 0.1729, + "step": 8067 + }, + { + "epoch": 0.129088, + "grad_norm": 0.96875, + "learning_rate": 8.779516129032258e-05, + "loss": 0.1915, + "step": 8068 + }, + { + "epoch": 0.129104, + "grad_norm": 0.59375, + "learning_rate": 8.779354838709678e-05, + "loss": 0.1689, + "step": 8069 + }, + { + "epoch": 0.12912, + "grad_norm": 0.6640625, + "learning_rate": 8.779193548387096e-05, + "loss": 0.164, + "step": 8070 + }, + { + "epoch": 0.129136, + "grad_norm": 0.578125, + "learning_rate": 8.779032258064516e-05, + "loss": 0.1278, + "step": 8071 + }, + { + "epoch": 0.129152, + "grad_norm": 1.0, + "learning_rate": 8.778870967741935e-05, + "loss": 0.1418, + "step": 8072 + }, + { + "epoch": 0.129168, + "grad_norm": 0.7734375, + "learning_rate": 8.778709677419355e-05, + "loss": 0.218, + "step": 8073 + }, + { + "epoch": 0.129184, + "grad_norm": 0.74609375, + "learning_rate": 8.778548387096775e-05, + "loss": 0.1524, + "step": 8074 + }, + { + "epoch": 0.1292, + "grad_norm": 0.91796875, + "learning_rate": 8.778387096774195e-05, + "loss": 0.1783, + "step": 8075 + }, + { + "epoch": 0.129216, + "grad_norm": 0.828125, + "learning_rate": 8.778225806451613e-05, + "loss": 0.198, + "step": 8076 + }, + { + "epoch": 0.129232, + "grad_norm": 0.984375, + "learning_rate": 8.778064516129033e-05, + "loss": 0.216, + "step": 8077 + }, + { + "epoch": 0.129248, + "grad_norm": 0.9609375, + "learning_rate": 8.777903225806452e-05, + "loss": 0.1765, + "step": 8078 + }, + { + "epoch": 0.129264, + "grad_norm": 1.0859375, + "learning_rate": 8.777741935483872e-05, + "loss": 0.2083, + "step": 8079 + }, + { + "epoch": 0.12928, + "grad_norm": 0.71875, + "learning_rate": 8.77758064516129e-05, + "loss": 0.1571, + "step": 8080 + }, + { + "epoch": 0.129296, + "grad_norm": 0.921875, + "learning_rate": 8.777419354838709e-05, + "loss": 0.1638, + "step": 8081 + }, + { + "epoch": 0.129312, + "grad_norm": 1.3046875, + "learning_rate": 8.777258064516129e-05, + "loss": 0.1701, + "step": 8082 + }, + { + "epoch": 0.129328, + "grad_norm": 0.76171875, + "learning_rate": 8.777096774193548e-05, + "loss": 0.201, + "step": 8083 + }, + { + "epoch": 0.129344, + "grad_norm": 0.81640625, + "learning_rate": 8.776935483870968e-05, + "loss": 0.1365, + "step": 8084 + }, + { + "epoch": 0.12936, + "grad_norm": 0.703125, + "learning_rate": 8.776774193548388e-05, + "loss": 0.1413, + "step": 8085 + }, + { + "epoch": 0.129376, + "grad_norm": 1.234375, + "learning_rate": 8.776612903225808e-05, + "loss": 0.1515, + "step": 8086 + }, + { + "epoch": 0.129392, + "grad_norm": 0.70703125, + "learning_rate": 8.776451612903226e-05, + "loss": 0.1678, + "step": 8087 + }, + { + "epoch": 0.129408, + "grad_norm": 0.671875, + "learning_rate": 8.776290322580646e-05, + "loss": 0.186, + "step": 8088 + }, + { + "epoch": 0.129424, + "grad_norm": 1.296875, + "learning_rate": 8.776129032258065e-05, + "loss": 0.211, + "step": 8089 + }, + { + "epoch": 0.12944, + "grad_norm": 1.0390625, + "learning_rate": 8.775967741935485e-05, + "loss": 0.1914, + "step": 8090 + }, + { + "epoch": 0.129456, + "grad_norm": 0.55078125, + "learning_rate": 8.775806451612903e-05, + "loss": 0.1392, + "step": 8091 + }, + { + "epoch": 0.129472, + "grad_norm": 1.078125, + "learning_rate": 8.775645161290323e-05, + "loss": 0.1489, + "step": 8092 + }, + { + "epoch": 0.129488, + "grad_norm": 0.6015625, + "learning_rate": 8.775483870967742e-05, + "loss": 0.1662, + "step": 8093 + }, + { + "epoch": 0.129504, + "grad_norm": 0.61328125, + "learning_rate": 8.775322580645162e-05, + "loss": 0.1455, + "step": 8094 + }, + { + "epoch": 0.12952, + "grad_norm": 0.8203125, + "learning_rate": 8.77516129032258e-05, + "loss": 0.1633, + "step": 8095 + }, + { + "epoch": 0.129536, + "grad_norm": 0.83203125, + "learning_rate": 8.775e-05, + "loss": 0.2179, + "step": 8096 + }, + { + "epoch": 0.129552, + "grad_norm": 0.96875, + "learning_rate": 8.77483870967742e-05, + "loss": 0.1725, + "step": 8097 + }, + { + "epoch": 0.129568, + "grad_norm": 0.890625, + "learning_rate": 8.774677419354839e-05, + "loss": 0.1704, + "step": 8098 + }, + { + "epoch": 0.129584, + "grad_norm": 0.8828125, + "learning_rate": 8.774516129032259e-05, + "loss": 0.2, + "step": 8099 + }, + { + "epoch": 0.1296, + "grad_norm": 0.63671875, + "learning_rate": 8.774354838709678e-05, + "loss": 0.1464, + "step": 8100 + }, + { + "epoch": 0.129616, + "grad_norm": 1.03125, + "learning_rate": 8.774193548387098e-05, + "loss": 0.1647, + "step": 8101 + }, + { + "epoch": 0.129632, + "grad_norm": 0.5859375, + "learning_rate": 8.774032258064516e-05, + "loss": 0.1585, + "step": 8102 + }, + { + "epoch": 0.129648, + "grad_norm": 0.59765625, + "learning_rate": 8.773870967741936e-05, + "loss": 0.1368, + "step": 8103 + }, + { + "epoch": 0.129664, + "grad_norm": 0.7265625, + "learning_rate": 8.773709677419355e-05, + "loss": 0.1594, + "step": 8104 + }, + { + "epoch": 0.12968, + "grad_norm": 0.63671875, + "learning_rate": 8.773548387096775e-05, + "loss": 0.1684, + "step": 8105 + }, + { + "epoch": 0.129696, + "grad_norm": 1.1484375, + "learning_rate": 8.773387096774193e-05, + "loss": 0.1961, + "step": 8106 + }, + { + "epoch": 0.129712, + "grad_norm": 0.70703125, + "learning_rate": 8.773225806451613e-05, + "loss": 0.1831, + "step": 8107 + }, + { + "epoch": 0.129728, + "grad_norm": 1.078125, + "learning_rate": 8.773064516129032e-05, + "loss": 0.1904, + "step": 8108 + }, + { + "epoch": 0.129744, + "grad_norm": 0.75, + "learning_rate": 8.772903225806452e-05, + "loss": 0.1809, + "step": 8109 + }, + { + "epoch": 0.12976, + "grad_norm": 0.58984375, + "learning_rate": 8.772741935483872e-05, + "loss": 0.1666, + "step": 8110 + }, + { + "epoch": 0.129776, + "grad_norm": 0.96484375, + "learning_rate": 8.77258064516129e-05, + "loss": 0.1779, + "step": 8111 + }, + { + "epoch": 0.129792, + "grad_norm": 0.69921875, + "learning_rate": 8.77241935483871e-05, + "loss": 0.1809, + "step": 8112 + }, + { + "epoch": 0.129808, + "grad_norm": 0.6171875, + "learning_rate": 8.772258064516129e-05, + "loss": 0.1574, + "step": 8113 + }, + { + "epoch": 0.129824, + "grad_norm": 0.5546875, + "learning_rate": 8.772096774193549e-05, + "loss": 0.1686, + "step": 8114 + }, + { + "epoch": 0.12984, + "grad_norm": 0.97265625, + "learning_rate": 8.771935483870968e-05, + "loss": 0.1521, + "step": 8115 + }, + { + "epoch": 0.129856, + "grad_norm": 0.68359375, + "learning_rate": 8.771774193548388e-05, + "loss": 0.1618, + "step": 8116 + }, + { + "epoch": 0.129872, + "grad_norm": 0.640625, + "learning_rate": 8.771612903225806e-05, + "loss": 0.1592, + "step": 8117 + }, + { + "epoch": 0.129888, + "grad_norm": 1.1171875, + "learning_rate": 8.771451612903226e-05, + "loss": 0.2323, + "step": 8118 + }, + { + "epoch": 0.129904, + "grad_norm": 0.8359375, + "learning_rate": 8.771290322580645e-05, + "loss": 0.1739, + "step": 8119 + }, + { + "epoch": 0.12992, + "grad_norm": 0.6640625, + "learning_rate": 8.771129032258065e-05, + "loss": 0.1557, + "step": 8120 + }, + { + "epoch": 0.129936, + "grad_norm": 0.94140625, + "learning_rate": 8.770967741935485e-05, + "loss": 0.1909, + "step": 8121 + }, + { + "epoch": 0.129952, + "grad_norm": 1.125, + "learning_rate": 8.770806451612905e-05, + "loss": 0.1734, + "step": 8122 + }, + { + "epoch": 0.129968, + "grad_norm": 0.765625, + "learning_rate": 8.770645161290323e-05, + "loss": 0.1922, + "step": 8123 + }, + { + "epoch": 0.129984, + "grad_norm": 0.72265625, + "learning_rate": 8.770483870967743e-05, + "loss": 0.2005, + "step": 8124 + }, + { + "epoch": 0.13, + "grad_norm": 0.7265625, + "learning_rate": 8.770322580645162e-05, + "loss": 0.16, + "step": 8125 + }, + { + "epoch": 0.130016, + "grad_norm": 0.6171875, + "learning_rate": 8.770161290322582e-05, + "loss": 0.1491, + "step": 8126 + }, + { + "epoch": 0.130032, + "grad_norm": 0.91796875, + "learning_rate": 8.77e-05, + "loss": 0.2153, + "step": 8127 + }, + { + "epoch": 0.130048, + "grad_norm": 0.8046875, + "learning_rate": 8.769838709677419e-05, + "loss": 0.2147, + "step": 8128 + }, + { + "epoch": 0.130064, + "grad_norm": 0.7421875, + "learning_rate": 8.769677419354839e-05, + "loss": 0.1602, + "step": 8129 + }, + { + "epoch": 0.13008, + "grad_norm": 0.796875, + "learning_rate": 8.769516129032258e-05, + "loss": 0.1627, + "step": 8130 + }, + { + "epoch": 0.130096, + "grad_norm": 1.25, + "learning_rate": 8.769354838709678e-05, + "loss": 0.174, + "step": 8131 + }, + { + "epoch": 0.130112, + "grad_norm": 0.83984375, + "learning_rate": 8.769193548387098e-05, + "loss": 0.2542, + "step": 8132 + }, + { + "epoch": 0.130128, + "grad_norm": 1.21875, + "learning_rate": 8.769032258064516e-05, + "loss": 0.1831, + "step": 8133 + }, + { + "epoch": 0.130144, + "grad_norm": 0.62109375, + "learning_rate": 8.768870967741936e-05, + "loss": 0.1278, + "step": 8134 + }, + { + "epoch": 0.13016, + "grad_norm": 0.62890625, + "learning_rate": 8.768709677419356e-05, + "loss": 0.1497, + "step": 8135 + }, + { + "epoch": 0.130176, + "grad_norm": 0.86328125, + "learning_rate": 8.768548387096775e-05, + "loss": 0.148, + "step": 8136 + }, + { + "epoch": 0.130192, + "grad_norm": 0.57421875, + "learning_rate": 8.768387096774195e-05, + "loss": 0.14, + "step": 8137 + }, + { + "epoch": 0.130208, + "grad_norm": 0.59765625, + "learning_rate": 8.768225806451613e-05, + "loss": 0.1804, + "step": 8138 + }, + { + "epoch": 0.130224, + "grad_norm": 0.57421875, + "learning_rate": 8.768064516129033e-05, + "loss": 0.1496, + "step": 8139 + }, + { + "epoch": 0.13024, + "grad_norm": 0.86328125, + "learning_rate": 8.767903225806452e-05, + "loss": 0.1518, + "step": 8140 + }, + { + "epoch": 0.130256, + "grad_norm": 0.8125, + "learning_rate": 8.767741935483872e-05, + "loss": 0.1829, + "step": 8141 + }, + { + "epoch": 0.130272, + "grad_norm": 0.5546875, + "learning_rate": 8.76758064516129e-05, + "loss": 0.1647, + "step": 8142 + }, + { + "epoch": 0.130288, + "grad_norm": 0.73046875, + "learning_rate": 8.767419354838709e-05, + "loss": 0.1957, + "step": 8143 + }, + { + "epoch": 0.130304, + "grad_norm": 0.7421875, + "learning_rate": 8.767258064516129e-05, + "loss": 0.1652, + "step": 8144 + }, + { + "epoch": 0.13032, + "grad_norm": 0.60546875, + "learning_rate": 8.767096774193549e-05, + "loss": 0.1576, + "step": 8145 + }, + { + "epoch": 0.130336, + "grad_norm": 0.7890625, + "learning_rate": 8.766935483870969e-05, + "loss": 0.1854, + "step": 8146 + }, + { + "epoch": 0.130352, + "grad_norm": 0.78125, + "learning_rate": 8.766774193548387e-05, + "loss": 0.1785, + "step": 8147 + }, + { + "epoch": 0.130368, + "grad_norm": 0.86328125, + "learning_rate": 8.766612903225807e-05, + "loss": 0.1536, + "step": 8148 + }, + { + "epoch": 0.130384, + "grad_norm": 0.94140625, + "learning_rate": 8.766451612903226e-05, + "loss": 0.162, + "step": 8149 + }, + { + "epoch": 0.1304, + "grad_norm": 0.90625, + "learning_rate": 8.766290322580646e-05, + "loss": 0.1927, + "step": 8150 + }, + { + "epoch": 0.130416, + "grad_norm": 1.2890625, + "learning_rate": 8.766129032258065e-05, + "loss": 0.2025, + "step": 8151 + }, + { + "epoch": 0.130432, + "grad_norm": 0.8515625, + "learning_rate": 8.765967741935485e-05, + "loss": 0.1388, + "step": 8152 + }, + { + "epoch": 0.130448, + "grad_norm": 2.140625, + "learning_rate": 8.765806451612903e-05, + "loss": 0.2194, + "step": 8153 + }, + { + "epoch": 0.130464, + "grad_norm": 0.9453125, + "learning_rate": 8.765645161290323e-05, + "loss": 0.1679, + "step": 8154 + }, + { + "epoch": 0.13048, + "grad_norm": 0.84765625, + "learning_rate": 8.765483870967742e-05, + "loss": 0.1856, + "step": 8155 + }, + { + "epoch": 0.130496, + "grad_norm": 1.3046875, + "learning_rate": 8.765322580645162e-05, + "loss": 0.1813, + "step": 8156 + }, + { + "epoch": 0.130512, + "grad_norm": 1.15625, + "learning_rate": 8.765161290322582e-05, + "loss": 0.2055, + "step": 8157 + }, + { + "epoch": 0.130528, + "grad_norm": 0.7578125, + "learning_rate": 8.765e-05, + "loss": 0.2086, + "step": 8158 + }, + { + "epoch": 0.130544, + "grad_norm": 0.8671875, + "learning_rate": 8.76483870967742e-05, + "loss": 0.2269, + "step": 8159 + }, + { + "epoch": 0.13056, + "grad_norm": 0.7734375, + "learning_rate": 8.764677419354839e-05, + "loss": 0.1449, + "step": 8160 + }, + { + "epoch": 0.130576, + "grad_norm": 0.796875, + "learning_rate": 8.764516129032259e-05, + "loss": 0.1275, + "step": 8161 + }, + { + "epoch": 0.130592, + "grad_norm": 0.7578125, + "learning_rate": 8.764354838709677e-05, + "loss": 0.1853, + "step": 8162 + }, + { + "epoch": 0.130608, + "grad_norm": 0.81640625, + "learning_rate": 8.764193548387097e-05, + "loss": 0.1553, + "step": 8163 + }, + { + "epoch": 0.130624, + "grad_norm": 0.9375, + "learning_rate": 8.764032258064516e-05, + "loss": 0.2091, + "step": 8164 + }, + { + "epoch": 0.13064, + "grad_norm": 0.8125, + "learning_rate": 8.763870967741936e-05, + "loss": 0.1558, + "step": 8165 + }, + { + "epoch": 0.130656, + "grad_norm": 0.83984375, + "learning_rate": 8.763709677419355e-05, + "loss": 0.178, + "step": 8166 + }, + { + "epoch": 0.130672, + "grad_norm": 0.65234375, + "learning_rate": 8.763548387096775e-05, + "loss": 0.1456, + "step": 8167 + }, + { + "epoch": 0.130688, + "grad_norm": 0.984375, + "learning_rate": 8.763387096774193e-05, + "loss": 0.222, + "step": 8168 + }, + { + "epoch": 0.130704, + "grad_norm": 0.7578125, + "learning_rate": 8.763225806451613e-05, + "loss": 0.1887, + "step": 8169 + }, + { + "epoch": 0.13072, + "grad_norm": 0.61328125, + "learning_rate": 8.763064516129033e-05, + "loss": 0.1531, + "step": 8170 + }, + { + "epoch": 0.130736, + "grad_norm": 0.66796875, + "learning_rate": 8.762903225806453e-05, + "loss": 0.1703, + "step": 8171 + }, + { + "epoch": 0.130752, + "grad_norm": 0.9296875, + "learning_rate": 8.762741935483872e-05, + "loss": 0.1712, + "step": 8172 + }, + { + "epoch": 0.130768, + "grad_norm": 1.6015625, + "learning_rate": 8.762580645161292e-05, + "loss": 0.1894, + "step": 8173 + }, + { + "epoch": 0.130784, + "grad_norm": 0.9765625, + "learning_rate": 8.76241935483871e-05, + "loss": 0.173, + "step": 8174 + }, + { + "epoch": 0.1308, + "grad_norm": 0.89453125, + "learning_rate": 8.762258064516129e-05, + "loss": 0.1886, + "step": 8175 + }, + { + "epoch": 0.130816, + "grad_norm": 1.03125, + "learning_rate": 8.762096774193549e-05, + "loss": 0.1756, + "step": 8176 + }, + { + "epoch": 0.130832, + "grad_norm": 0.8671875, + "learning_rate": 8.761935483870967e-05, + "loss": 0.1932, + "step": 8177 + }, + { + "epoch": 0.130848, + "grad_norm": 0.8125, + "learning_rate": 8.761774193548387e-05, + "loss": 0.2184, + "step": 8178 + }, + { + "epoch": 0.130864, + "grad_norm": 0.93359375, + "learning_rate": 8.761612903225806e-05, + "loss": 0.2252, + "step": 8179 + }, + { + "epoch": 0.13088, + "grad_norm": 0.6015625, + "learning_rate": 8.761451612903226e-05, + "loss": 0.1837, + "step": 8180 + }, + { + "epoch": 0.130896, + "grad_norm": 1.25, + "learning_rate": 8.761290322580646e-05, + "loss": 0.1555, + "step": 8181 + }, + { + "epoch": 0.130912, + "grad_norm": 0.66015625, + "learning_rate": 8.761129032258066e-05, + "loss": 0.1809, + "step": 8182 + }, + { + "epoch": 0.130928, + "grad_norm": 0.84375, + "learning_rate": 8.760967741935484e-05, + "loss": 0.143, + "step": 8183 + }, + { + "epoch": 0.130944, + "grad_norm": 1.203125, + "learning_rate": 8.760806451612904e-05, + "loss": 0.1922, + "step": 8184 + }, + { + "epoch": 0.13096, + "grad_norm": 0.5234375, + "learning_rate": 8.760645161290323e-05, + "loss": 0.1547, + "step": 8185 + }, + { + "epoch": 0.130976, + "grad_norm": 0.5859375, + "learning_rate": 8.760483870967743e-05, + "loss": 0.1884, + "step": 8186 + }, + { + "epoch": 0.130992, + "grad_norm": 0.703125, + "learning_rate": 8.760322580645162e-05, + "loss": 0.1722, + "step": 8187 + }, + { + "epoch": 0.131008, + "grad_norm": 1.125, + "learning_rate": 8.760161290322582e-05, + "loss": 0.1569, + "step": 8188 + }, + { + "epoch": 0.131024, + "grad_norm": 1.125, + "learning_rate": 8.76e-05, + "loss": 0.1892, + "step": 8189 + }, + { + "epoch": 0.13104, + "grad_norm": 1.078125, + "learning_rate": 8.759838709677419e-05, + "loss": 0.1714, + "step": 8190 + }, + { + "epoch": 0.131056, + "grad_norm": 0.71875, + "learning_rate": 8.759677419354839e-05, + "loss": 0.186, + "step": 8191 + }, + { + "epoch": 0.131072, + "grad_norm": 0.63671875, + "learning_rate": 8.759516129032259e-05, + "loss": 0.1484, + "step": 8192 + }, + { + "epoch": 0.131088, + "grad_norm": 1.2734375, + "learning_rate": 8.759354838709679e-05, + "loss": 0.1922, + "step": 8193 + }, + { + "epoch": 0.131104, + "grad_norm": 1.03125, + "learning_rate": 8.759193548387097e-05, + "loss": 0.2156, + "step": 8194 + }, + { + "epoch": 0.13112, + "grad_norm": 0.6015625, + "learning_rate": 8.759032258064517e-05, + "loss": 0.1543, + "step": 8195 + }, + { + "epoch": 0.131136, + "grad_norm": 0.765625, + "learning_rate": 8.758870967741936e-05, + "loss": 0.2074, + "step": 8196 + }, + { + "epoch": 0.131152, + "grad_norm": 0.9296875, + "learning_rate": 8.758709677419356e-05, + "loss": 0.2473, + "step": 8197 + }, + { + "epoch": 0.131168, + "grad_norm": 1.390625, + "learning_rate": 8.758548387096774e-05, + "loss": 0.2091, + "step": 8198 + }, + { + "epoch": 0.131184, + "grad_norm": 0.83203125, + "learning_rate": 8.758387096774194e-05, + "loss": 0.1534, + "step": 8199 + }, + { + "epoch": 0.1312, + "grad_norm": 0.765625, + "learning_rate": 8.758225806451613e-05, + "loss": 0.1547, + "step": 8200 + }, + { + "epoch": 0.131216, + "grad_norm": 0.90234375, + "learning_rate": 8.758064516129033e-05, + "loss": 0.1585, + "step": 8201 + }, + { + "epoch": 0.131232, + "grad_norm": 0.78515625, + "learning_rate": 8.757903225806452e-05, + "loss": 0.2075, + "step": 8202 + }, + { + "epoch": 0.131248, + "grad_norm": 0.69140625, + "learning_rate": 8.757741935483872e-05, + "loss": 0.1585, + "step": 8203 + }, + { + "epoch": 0.131264, + "grad_norm": 0.5546875, + "learning_rate": 8.75758064516129e-05, + "loss": 0.1648, + "step": 8204 + }, + { + "epoch": 0.13128, + "grad_norm": 0.75, + "learning_rate": 8.75741935483871e-05, + "loss": 0.1841, + "step": 8205 + }, + { + "epoch": 0.131296, + "grad_norm": 0.76953125, + "learning_rate": 8.75725806451613e-05, + "loss": 0.1681, + "step": 8206 + }, + { + "epoch": 0.131312, + "grad_norm": 0.84765625, + "learning_rate": 8.757096774193549e-05, + "loss": 0.1688, + "step": 8207 + }, + { + "epoch": 0.131328, + "grad_norm": 0.984375, + "learning_rate": 8.756935483870969e-05, + "loss": 0.1749, + "step": 8208 + }, + { + "epoch": 0.131344, + "grad_norm": 1.09375, + "learning_rate": 8.756774193548387e-05, + "loss": 0.1712, + "step": 8209 + }, + { + "epoch": 0.13136, + "grad_norm": 0.64453125, + "learning_rate": 8.756612903225807e-05, + "loss": 0.1356, + "step": 8210 + }, + { + "epoch": 0.131376, + "grad_norm": 0.59765625, + "learning_rate": 8.756451612903226e-05, + "loss": 0.1598, + "step": 8211 + }, + { + "epoch": 0.131392, + "grad_norm": 0.75390625, + "learning_rate": 8.756290322580646e-05, + "loss": 0.1373, + "step": 8212 + }, + { + "epoch": 0.131408, + "grad_norm": 0.640625, + "learning_rate": 8.756129032258064e-05, + "loss": 0.1721, + "step": 8213 + }, + { + "epoch": 0.131424, + "grad_norm": 0.796875, + "learning_rate": 8.755967741935484e-05, + "loss": 0.1806, + "step": 8214 + }, + { + "epoch": 0.13144, + "grad_norm": 1.109375, + "learning_rate": 8.755806451612903e-05, + "loss": 0.1545, + "step": 8215 + }, + { + "epoch": 0.131456, + "grad_norm": 0.5703125, + "learning_rate": 8.755645161290323e-05, + "loss": 0.1858, + "step": 8216 + }, + { + "epoch": 0.131472, + "grad_norm": 0.78515625, + "learning_rate": 8.755483870967743e-05, + "loss": 0.1536, + "step": 8217 + }, + { + "epoch": 0.131488, + "grad_norm": 0.84375, + "learning_rate": 8.755322580645163e-05, + "loss": 0.1493, + "step": 8218 + }, + { + "epoch": 0.131504, + "grad_norm": 0.74609375, + "learning_rate": 8.755161290322581e-05, + "loss": 0.1799, + "step": 8219 + }, + { + "epoch": 0.13152, + "grad_norm": 0.61328125, + "learning_rate": 8.755e-05, + "loss": 0.1616, + "step": 8220 + }, + { + "epoch": 0.131536, + "grad_norm": 0.70703125, + "learning_rate": 8.75483870967742e-05, + "loss": 0.1571, + "step": 8221 + }, + { + "epoch": 0.131552, + "grad_norm": 0.97265625, + "learning_rate": 8.754677419354839e-05, + "loss": 0.167, + "step": 8222 + }, + { + "epoch": 0.131568, + "grad_norm": 0.60546875, + "learning_rate": 8.754516129032259e-05, + "loss": 0.1456, + "step": 8223 + }, + { + "epoch": 0.131584, + "grad_norm": 0.7578125, + "learning_rate": 8.754354838709677e-05, + "loss": 0.1997, + "step": 8224 + }, + { + "epoch": 0.1316, + "grad_norm": 0.63671875, + "learning_rate": 8.754193548387097e-05, + "loss": 0.1814, + "step": 8225 + }, + { + "epoch": 0.131616, + "grad_norm": 0.73828125, + "learning_rate": 8.754032258064516e-05, + "loss": 0.1709, + "step": 8226 + }, + { + "epoch": 0.131632, + "grad_norm": 0.7265625, + "learning_rate": 8.753870967741936e-05, + "loss": 0.167, + "step": 8227 + }, + { + "epoch": 0.131648, + "grad_norm": 1.3671875, + "learning_rate": 8.753709677419354e-05, + "loss": 0.1867, + "step": 8228 + }, + { + "epoch": 0.131664, + "grad_norm": 0.8671875, + "learning_rate": 8.753548387096774e-05, + "loss": 0.1609, + "step": 8229 + }, + { + "epoch": 0.13168, + "grad_norm": 1.5546875, + "learning_rate": 8.753387096774194e-05, + "loss": 0.1685, + "step": 8230 + }, + { + "epoch": 0.131696, + "grad_norm": 0.59765625, + "learning_rate": 8.753225806451614e-05, + "loss": 0.1399, + "step": 8231 + }, + { + "epoch": 0.131712, + "grad_norm": 0.80859375, + "learning_rate": 8.753064516129033e-05, + "loss": 0.1885, + "step": 8232 + }, + { + "epoch": 0.131728, + "grad_norm": 0.92578125, + "learning_rate": 8.752903225806453e-05, + "loss": 0.1869, + "step": 8233 + }, + { + "epoch": 0.131744, + "grad_norm": 0.7265625, + "learning_rate": 8.752741935483871e-05, + "loss": 0.1798, + "step": 8234 + }, + { + "epoch": 0.13176, + "grad_norm": 1.1484375, + "learning_rate": 8.752580645161291e-05, + "loss": 0.1894, + "step": 8235 + }, + { + "epoch": 0.131776, + "grad_norm": 0.75390625, + "learning_rate": 8.75241935483871e-05, + "loss": 0.1459, + "step": 8236 + }, + { + "epoch": 0.131792, + "grad_norm": 0.96484375, + "learning_rate": 8.752258064516129e-05, + "loss": 0.1525, + "step": 8237 + }, + { + "epoch": 0.131808, + "grad_norm": 0.74609375, + "learning_rate": 8.752096774193549e-05, + "loss": 0.1737, + "step": 8238 + }, + { + "epoch": 0.131824, + "grad_norm": 0.62890625, + "learning_rate": 8.751935483870967e-05, + "loss": 0.1478, + "step": 8239 + }, + { + "epoch": 0.13184, + "grad_norm": 1.0234375, + "learning_rate": 8.751774193548387e-05, + "loss": 0.1425, + "step": 8240 + }, + { + "epoch": 0.131856, + "grad_norm": 0.64453125, + "learning_rate": 8.751612903225807e-05, + "loss": 0.1615, + "step": 8241 + }, + { + "epoch": 0.131872, + "grad_norm": 0.74609375, + "learning_rate": 8.751451612903227e-05, + "loss": 0.1828, + "step": 8242 + }, + { + "epoch": 0.131888, + "grad_norm": 0.65234375, + "learning_rate": 8.751290322580646e-05, + "loss": 0.1809, + "step": 8243 + }, + { + "epoch": 0.131904, + "grad_norm": 1.484375, + "learning_rate": 8.751129032258066e-05, + "loss": 0.233, + "step": 8244 + }, + { + "epoch": 0.13192, + "grad_norm": 0.61328125, + "learning_rate": 8.750967741935484e-05, + "loss": 0.1935, + "step": 8245 + }, + { + "epoch": 0.131936, + "grad_norm": 0.921875, + "learning_rate": 8.750806451612904e-05, + "loss": 0.1693, + "step": 8246 + }, + { + "epoch": 0.131952, + "grad_norm": 0.85546875, + "learning_rate": 8.750645161290323e-05, + "loss": 0.1727, + "step": 8247 + }, + { + "epoch": 0.131968, + "grad_norm": 0.84375, + "learning_rate": 8.750483870967743e-05, + "loss": 0.2091, + "step": 8248 + }, + { + "epoch": 0.131984, + "grad_norm": 0.66796875, + "learning_rate": 8.750322580645161e-05, + "loss": 0.1789, + "step": 8249 + }, + { + "epoch": 0.132, + "grad_norm": 0.515625, + "learning_rate": 8.750161290322581e-05, + "loss": 0.1746, + "step": 8250 + }, + { + "epoch": 0.132016, + "grad_norm": 1.0625, + "learning_rate": 8.75e-05, + "loss": 0.2042, + "step": 8251 + }, + { + "epoch": 0.132032, + "grad_norm": 0.7109375, + "learning_rate": 8.74983870967742e-05, + "loss": 0.1751, + "step": 8252 + }, + { + "epoch": 0.132048, + "grad_norm": 0.7734375, + "learning_rate": 8.74967741935484e-05, + "loss": 0.1811, + "step": 8253 + }, + { + "epoch": 0.132064, + "grad_norm": 0.69140625, + "learning_rate": 8.749516129032258e-05, + "loss": 0.1904, + "step": 8254 + }, + { + "epoch": 0.13208, + "grad_norm": 0.6484375, + "learning_rate": 8.749354838709678e-05, + "loss": 0.1495, + "step": 8255 + }, + { + "epoch": 0.132096, + "grad_norm": 0.59375, + "learning_rate": 8.749193548387097e-05, + "loss": 0.15, + "step": 8256 + }, + { + "epoch": 0.132112, + "grad_norm": 0.94140625, + "learning_rate": 8.749032258064517e-05, + "loss": 0.1957, + "step": 8257 + }, + { + "epoch": 0.132128, + "grad_norm": 0.66796875, + "learning_rate": 8.748870967741936e-05, + "loss": 0.1752, + "step": 8258 + }, + { + "epoch": 0.132144, + "grad_norm": 0.94921875, + "learning_rate": 8.748709677419356e-05, + "loss": 0.2227, + "step": 8259 + }, + { + "epoch": 0.13216, + "grad_norm": 1.046875, + "learning_rate": 8.748548387096774e-05, + "loss": 0.1666, + "step": 8260 + }, + { + "epoch": 0.132176, + "grad_norm": 1.015625, + "learning_rate": 8.748387096774194e-05, + "loss": 0.2036, + "step": 8261 + }, + { + "epoch": 0.132192, + "grad_norm": 0.89453125, + "learning_rate": 8.748225806451613e-05, + "loss": 0.1534, + "step": 8262 + }, + { + "epoch": 0.132208, + "grad_norm": 0.578125, + "learning_rate": 8.748064516129033e-05, + "loss": 0.1615, + "step": 8263 + }, + { + "epoch": 0.132224, + "grad_norm": 0.90625, + "learning_rate": 8.747903225806451e-05, + "loss": 0.2067, + "step": 8264 + }, + { + "epoch": 0.13224, + "grad_norm": 0.58203125, + "learning_rate": 8.747741935483871e-05, + "loss": 0.137, + "step": 8265 + }, + { + "epoch": 0.132256, + "grad_norm": 0.6796875, + "learning_rate": 8.747580645161291e-05, + "loss": 0.1707, + "step": 8266 + }, + { + "epoch": 0.132272, + "grad_norm": 0.87109375, + "learning_rate": 8.74741935483871e-05, + "loss": 0.1692, + "step": 8267 + }, + { + "epoch": 0.132288, + "grad_norm": 0.703125, + "learning_rate": 8.74725806451613e-05, + "loss": 0.1953, + "step": 8268 + }, + { + "epoch": 0.132304, + "grad_norm": 0.91796875, + "learning_rate": 8.747096774193548e-05, + "loss": 0.2016, + "step": 8269 + }, + { + "epoch": 0.13232, + "grad_norm": 0.59765625, + "learning_rate": 8.746935483870968e-05, + "loss": 0.1848, + "step": 8270 + }, + { + "epoch": 0.132336, + "grad_norm": 0.6171875, + "learning_rate": 8.746774193548387e-05, + "loss": 0.1406, + "step": 8271 + }, + { + "epoch": 0.132352, + "grad_norm": 0.89453125, + "learning_rate": 8.746612903225807e-05, + "loss": 0.2084, + "step": 8272 + }, + { + "epoch": 0.132368, + "grad_norm": 0.6171875, + "learning_rate": 8.746451612903226e-05, + "loss": 0.1603, + "step": 8273 + }, + { + "epoch": 0.132384, + "grad_norm": 0.84375, + "learning_rate": 8.746290322580646e-05, + "loss": 0.2118, + "step": 8274 + }, + { + "epoch": 0.1324, + "grad_norm": 0.94140625, + "learning_rate": 8.746129032258064e-05, + "loss": 0.1514, + "step": 8275 + }, + { + "epoch": 0.132416, + "grad_norm": 0.6328125, + "learning_rate": 8.745967741935484e-05, + "loss": 0.1342, + "step": 8276 + }, + { + "epoch": 0.132432, + "grad_norm": 0.59765625, + "learning_rate": 8.745806451612904e-05, + "loss": 0.1578, + "step": 8277 + }, + { + "epoch": 0.132448, + "grad_norm": 0.85546875, + "learning_rate": 8.745645161290324e-05, + "loss": 0.182, + "step": 8278 + }, + { + "epoch": 0.132464, + "grad_norm": 1.015625, + "learning_rate": 8.745483870967743e-05, + "loss": 0.1965, + "step": 8279 + }, + { + "epoch": 0.13248, + "grad_norm": 0.9140625, + "learning_rate": 8.745322580645163e-05, + "loss": 0.204, + "step": 8280 + }, + { + "epoch": 0.132496, + "grad_norm": 0.61328125, + "learning_rate": 8.745161290322581e-05, + "loss": 0.1476, + "step": 8281 + }, + { + "epoch": 0.132512, + "grad_norm": 0.7265625, + "learning_rate": 8.745000000000001e-05, + "loss": 0.1524, + "step": 8282 + }, + { + "epoch": 0.132528, + "grad_norm": 0.91015625, + "learning_rate": 8.74483870967742e-05, + "loss": 0.1839, + "step": 8283 + }, + { + "epoch": 0.132544, + "grad_norm": 0.76171875, + "learning_rate": 8.744677419354838e-05, + "loss": 0.2016, + "step": 8284 + }, + { + "epoch": 0.13256, + "grad_norm": 1.84375, + "learning_rate": 8.744516129032258e-05, + "loss": 0.2009, + "step": 8285 + }, + { + "epoch": 0.132576, + "grad_norm": 0.68359375, + "learning_rate": 8.744354838709677e-05, + "loss": 0.1841, + "step": 8286 + }, + { + "epoch": 0.132592, + "grad_norm": 0.9140625, + "learning_rate": 8.744193548387097e-05, + "loss": 0.1771, + "step": 8287 + }, + { + "epoch": 0.132608, + "grad_norm": 0.82421875, + "learning_rate": 8.744032258064517e-05, + "loss": 0.1708, + "step": 8288 + }, + { + "epoch": 0.132624, + "grad_norm": 0.56640625, + "learning_rate": 8.743870967741937e-05, + "loss": 0.1409, + "step": 8289 + }, + { + "epoch": 0.13264, + "grad_norm": 0.58203125, + "learning_rate": 8.743709677419355e-05, + "loss": 0.1722, + "step": 8290 + }, + { + "epoch": 0.132656, + "grad_norm": 1.109375, + "learning_rate": 8.743548387096775e-05, + "loss": 0.1817, + "step": 8291 + }, + { + "epoch": 0.132672, + "grad_norm": 1.015625, + "learning_rate": 8.743387096774194e-05, + "loss": 0.1675, + "step": 8292 + }, + { + "epoch": 0.132688, + "grad_norm": 0.703125, + "learning_rate": 8.743225806451614e-05, + "loss": 0.172, + "step": 8293 + }, + { + "epoch": 0.132704, + "grad_norm": 0.66796875, + "learning_rate": 8.743064516129033e-05, + "loss": 0.1691, + "step": 8294 + }, + { + "epoch": 0.13272, + "grad_norm": 1.0234375, + "learning_rate": 8.742903225806453e-05, + "loss": 0.1771, + "step": 8295 + }, + { + "epoch": 0.132736, + "grad_norm": 0.859375, + "learning_rate": 8.742741935483871e-05, + "loss": 0.1764, + "step": 8296 + }, + { + "epoch": 0.132752, + "grad_norm": 0.921875, + "learning_rate": 8.742580645161291e-05, + "loss": 0.1808, + "step": 8297 + }, + { + "epoch": 0.132768, + "grad_norm": 0.625, + "learning_rate": 8.74241935483871e-05, + "loss": 0.1869, + "step": 8298 + }, + { + "epoch": 0.132784, + "grad_norm": 0.76171875, + "learning_rate": 8.742258064516128e-05, + "loss": 0.1449, + "step": 8299 + }, + { + "epoch": 0.1328, + "grad_norm": 0.7890625, + "learning_rate": 8.742096774193548e-05, + "loss": 0.1725, + "step": 8300 + }, + { + "epoch": 0.132816, + "grad_norm": 1.15625, + "learning_rate": 8.741935483870968e-05, + "loss": 0.1733, + "step": 8301 + }, + { + "epoch": 0.132832, + "grad_norm": 0.56640625, + "learning_rate": 8.741774193548388e-05, + "loss": 0.1948, + "step": 8302 + }, + { + "epoch": 0.132848, + "grad_norm": 0.8984375, + "learning_rate": 8.741612903225807e-05, + "loss": 0.1757, + "step": 8303 + }, + { + "epoch": 0.132864, + "grad_norm": 0.78125, + "learning_rate": 8.741451612903227e-05, + "loss": 0.2057, + "step": 8304 + }, + { + "epoch": 0.13288, + "grad_norm": 0.83984375, + "learning_rate": 8.741290322580645e-05, + "loss": 0.1886, + "step": 8305 + }, + { + "epoch": 0.132896, + "grad_norm": 1.3828125, + "learning_rate": 8.741129032258065e-05, + "loss": 0.1875, + "step": 8306 + }, + { + "epoch": 0.132912, + "grad_norm": 1.171875, + "learning_rate": 8.740967741935484e-05, + "loss": 0.1887, + "step": 8307 + }, + { + "epoch": 0.132928, + "grad_norm": 0.6796875, + "learning_rate": 8.740806451612904e-05, + "loss": 0.1903, + "step": 8308 + }, + { + "epoch": 0.132944, + "grad_norm": 0.7265625, + "learning_rate": 8.740645161290323e-05, + "loss": 0.1844, + "step": 8309 + }, + { + "epoch": 0.13296, + "grad_norm": 0.88671875, + "learning_rate": 8.740483870967742e-05, + "loss": 0.1617, + "step": 8310 + }, + { + "epoch": 0.132976, + "grad_norm": 1.1171875, + "learning_rate": 8.740322580645161e-05, + "loss": 0.193, + "step": 8311 + }, + { + "epoch": 0.132992, + "grad_norm": 0.765625, + "learning_rate": 8.740161290322581e-05, + "loss": 0.1625, + "step": 8312 + }, + { + "epoch": 0.133008, + "grad_norm": 0.70703125, + "learning_rate": 8.740000000000001e-05, + "loss": 0.2024, + "step": 8313 + }, + { + "epoch": 0.133024, + "grad_norm": 1.5078125, + "learning_rate": 8.73983870967742e-05, + "loss": 0.1815, + "step": 8314 + }, + { + "epoch": 0.13304, + "grad_norm": 1.0625, + "learning_rate": 8.73967741935484e-05, + "loss": 0.2176, + "step": 8315 + }, + { + "epoch": 0.133056, + "grad_norm": 0.87109375, + "learning_rate": 8.739516129032258e-05, + "loss": 0.1626, + "step": 8316 + }, + { + "epoch": 0.133072, + "grad_norm": 0.68359375, + "learning_rate": 8.739354838709678e-05, + "loss": 0.1697, + "step": 8317 + }, + { + "epoch": 0.133088, + "grad_norm": 0.6484375, + "learning_rate": 8.739193548387097e-05, + "loss": 0.1567, + "step": 8318 + }, + { + "epoch": 0.133104, + "grad_norm": 0.6875, + "learning_rate": 8.739032258064517e-05, + "loss": 0.1621, + "step": 8319 + }, + { + "epoch": 0.13312, + "grad_norm": 0.5859375, + "learning_rate": 8.738870967741935e-05, + "loss": 0.1672, + "step": 8320 + }, + { + "epoch": 0.133136, + "grad_norm": 1.2734375, + "learning_rate": 8.738709677419355e-05, + "loss": 0.1962, + "step": 8321 + }, + { + "epoch": 0.133152, + "grad_norm": 0.94140625, + "learning_rate": 8.738548387096774e-05, + "loss": 0.1633, + "step": 8322 + }, + { + "epoch": 0.133168, + "grad_norm": 1.109375, + "learning_rate": 8.738387096774194e-05, + "loss": 0.1811, + "step": 8323 + }, + { + "epoch": 0.133184, + "grad_norm": 0.82421875, + "learning_rate": 8.738225806451612e-05, + "loss": 0.199, + "step": 8324 + }, + { + "epoch": 0.1332, + "grad_norm": 1.0625, + "learning_rate": 8.738064516129032e-05, + "loss": 0.1945, + "step": 8325 + }, + { + "epoch": 0.133216, + "grad_norm": 0.8203125, + "learning_rate": 8.737903225806452e-05, + "loss": 0.1699, + "step": 8326 + }, + { + "epoch": 0.133232, + "grad_norm": 0.99609375, + "learning_rate": 8.737741935483872e-05, + "loss": 0.1928, + "step": 8327 + }, + { + "epoch": 0.133248, + "grad_norm": 1.125, + "learning_rate": 8.737580645161291e-05, + "loss": 0.199, + "step": 8328 + }, + { + "epoch": 0.133264, + "grad_norm": 0.6875, + "learning_rate": 8.73741935483871e-05, + "loss": 0.158, + "step": 8329 + }, + { + "epoch": 0.13328, + "grad_norm": 0.921875, + "learning_rate": 8.73725806451613e-05, + "loss": 0.1529, + "step": 8330 + }, + { + "epoch": 0.133296, + "grad_norm": 1.4296875, + "learning_rate": 8.737096774193548e-05, + "loss": 0.1782, + "step": 8331 + }, + { + "epoch": 0.133312, + "grad_norm": 0.73828125, + "learning_rate": 8.736935483870968e-05, + "loss": 0.167, + "step": 8332 + }, + { + "epoch": 0.133328, + "grad_norm": 0.90234375, + "learning_rate": 8.736774193548387e-05, + "loss": 0.1696, + "step": 8333 + }, + { + "epoch": 0.133344, + "grad_norm": 0.64453125, + "learning_rate": 8.736612903225807e-05, + "loss": 0.1617, + "step": 8334 + }, + { + "epoch": 0.13336, + "grad_norm": 1.1328125, + "learning_rate": 8.736451612903225e-05, + "loss": 0.1903, + "step": 8335 + }, + { + "epoch": 0.133376, + "grad_norm": 0.8671875, + "learning_rate": 8.736290322580645e-05, + "loss": 0.151, + "step": 8336 + }, + { + "epoch": 0.133392, + "grad_norm": 0.60546875, + "learning_rate": 8.736129032258065e-05, + "loss": 0.1871, + "step": 8337 + }, + { + "epoch": 0.133408, + "grad_norm": 1.203125, + "learning_rate": 8.735967741935485e-05, + "loss": 0.2013, + "step": 8338 + }, + { + "epoch": 0.133424, + "grad_norm": 0.87890625, + "learning_rate": 8.735806451612904e-05, + "loss": 0.1763, + "step": 8339 + }, + { + "epoch": 0.13344, + "grad_norm": 0.87109375, + "learning_rate": 8.735645161290324e-05, + "loss": 0.1727, + "step": 8340 + }, + { + "epoch": 0.133456, + "grad_norm": 0.8515625, + "learning_rate": 8.735483870967742e-05, + "loss": 0.1711, + "step": 8341 + }, + { + "epoch": 0.133472, + "grad_norm": 0.69140625, + "learning_rate": 8.735322580645162e-05, + "loss": 0.188, + "step": 8342 + }, + { + "epoch": 0.133488, + "grad_norm": 1.1953125, + "learning_rate": 8.735161290322581e-05, + "loss": 0.1651, + "step": 8343 + }, + { + "epoch": 0.133504, + "grad_norm": 0.88671875, + "learning_rate": 8.735000000000001e-05, + "loss": 0.155, + "step": 8344 + }, + { + "epoch": 0.13352, + "grad_norm": 0.859375, + "learning_rate": 8.73483870967742e-05, + "loss": 0.1914, + "step": 8345 + }, + { + "epoch": 0.133536, + "grad_norm": 1.1640625, + "learning_rate": 8.734677419354838e-05, + "loss": 0.1748, + "step": 8346 + }, + { + "epoch": 0.133552, + "grad_norm": 0.71484375, + "learning_rate": 8.734516129032258e-05, + "loss": 0.1939, + "step": 8347 + }, + { + "epoch": 0.133568, + "grad_norm": 0.73046875, + "learning_rate": 8.734354838709678e-05, + "loss": 0.1902, + "step": 8348 + }, + { + "epoch": 0.133584, + "grad_norm": 0.859375, + "learning_rate": 8.734193548387098e-05, + "loss": 0.1787, + "step": 8349 + }, + { + "epoch": 0.1336, + "grad_norm": 0.671875, + "learning_rate": 8.734032258064517e-05, + "loss": 0.1628, + "step": 8350 + }, + { + "epoch": 0.133616, + "grad_norm": 0.71875, + "learning_rate": 8.733870967741937e-05, + "loss": 0.164, + "step": 8351 + }, + { + "epoch": 0.133632, + "grad_norm": 0.6875, + "learning_rate": 8.733709677419355e-05, + "loss": 0.1756, + "step": 8352 + }, + { + "epoch": 0.133648, + "grad_norm": 1.15625, + "learning_rate": 8.733548387096775e-05, + "loss": 0.1618, + "step": 8353 + }, + { + "epoch": 0.133664, + "grad_norm": 1.15625, + "learning_rate": 8.733387096774194e-05, + "loss": 0.2049, + "step": 8354 + }, + { + "epoch": 0.13368, + "grad_norm": 1.015625, + "learning_rate": 8.733225806451614e-05, + "loss": 0.1724, + "step": 8355 + }, + { + "epoch": 0.133696, + "grad_norm": 0.6953125, + "learning_rate": 8.733064516129032e-05, + "loss": 0.1477, + "step": 8356 + }, + { + "epoch": 0.133712, + "grad_norm": 1.0859375, + "learning_rate": 8.732903225806452e-05, + "loss": 0.1908, + "step": 8357 + }, + { + "epoch": 0.133728, + "grad_norm": 1.0078125, + "learning_rate": 8.732741935483871e-05, + "loss": 0.1786, + "step": 8358 + }, + { + "epoch": 0.133744, + "grad_norm": 1.3515625, + "learning_rate": 8.732580645161291e-05, + "loss": 0.1821, + "step": 8359 + }, + { + "epoch": 0.13376, + "grad_norm": 1.6171875, + "learning_rate": 8.73241935483871e-05, + "loss": 0.183, + "step": 8360 + }, + { + "epoch": 0.133776, + "grad_norm": 0.97265625, + "learning_rate": 8.73225806451613e-05, + "loss": 0.1429, + "step": 8361 + }, + { + "epoch": 0.133792, + "grad_norm": 0.5390625, + "learning_rate": 8.73209677419355e-05, + "loss": 0.1413, + "step": 8362 + }, + { + "epoch": 0.133808, + "grad_norm": 0.625, + "learning_rate": 8.731935483870968e-05, + "loss": 0.17, + "step": 8363 + }, + { + "epoch": 0.133824, + "grad_norm": 0.76171875, + "learning_rate": 8.731774193548388e-05, + "loss": 0.1962, + "step": 8364 + }, + { + "epoch": 0.13384, + "grad_norm": 0.75390625, + "learning_rate": 8.731612903225807e-05, + "loss": 0.1923, + "step": 8365 + }, + { + "epoch": 0.133856, + "grad_norm": 0.64453125, + "learning_rate": 8.731451612903227e-05, + "loss": 0.1636, + "step": 8366 + }, + { + "epoch": 0.133872, + "grad_norm": 0.9921875, + "learning_rate": 8.731290322580645e-05, + "loss": 0.2533, + "step": 8367 + }, + { + "epoch": 0.133888, + "grad_norm": 0.734375, + "learning_rate": 8.731129032258065e-05, + "loss": 0.1825, + "step": 8368 + }, + { + "epoch": 0.133904, + "grad_norm": 0.84765625, + "learning_rate": 8.730967741935484e-05, + "loss": 0.1705, + "step": 8369 + }, + { + "epoch": 0.13392, + "grad_norm": 1.078125, + "learning_rate": 8.730806451612904e-05, + "loss": 0.2069, + "step": 8370 + }, + { + "epoch": 0.133936, + "grad_norm": 0.9609375, + "learning_rate": 8.730645161290322e-05, + "loss": 0.1881, + "step": 8371 + }, + { + "epoch": 0.133952, + "grad_norm": 1.046875, + "learning_rate": 8.730483870967742e-05, + "loss": 0.1782, + "step": 8372 + }, + { + "epoch": 0.133968, + "grad_norm": 0.9765625, + "learning_rate": 8.730322580645162e-05, + "loss": 0.1758, + "step": 8373 + }, + { + "epoch": 0.133984, + "grad_norm": 0.66796875, + "learning_rate": 8.730161290322582e-05, + "loss": 0.161, + "step": 8374 + }, + { + "epoch": 0.134, + "grad_norm": 0.8359375, + "learning_rate": 8.730000000000001e-05, + "loss": 0.212, + "step": 8375 + }, + { + "epoch": 0.134016, + "grad_norm": 0.734375, + "learning_rate": 8.72983870967742e-05, + "loss": 0.1864, + "step": 8376 + }, + { + "epoch": 0.134032, + "grad_norm": 1.140625, + "learning_rate": 8.72967741935484e-05, + "loss": 0.2079, + "step": 8377 + }, + { + "epoch": 0.134048, + "grad_norm": 0.7890625, + "learning_rate": 8.729516129032258e-05, + "loss": 0.1711, + "step": 8378 + }, + { + "epoch": 0.134064, + "grad_norm": 0.78125, + "learning_rate": 8.729354838709678e-05, + "loss": 0.1988, + "step": 8379 + }, + { + "epoch": 0.13408, + "grad_norm": 1.0078125, + "learning_rate": 8.729193548387097e-05, + "loss": 0.1771, + "step": 8380 + }, + { + "epoch": 0.134096, + "grad_norm": 1.140625, + "learning_rate": 8.729032258064516e-05, + "loss": 0.1686, + "step": 8381 + }, + { + "epoch": 0.134112, + "grad_norm": 0.9296875, + "learning_rate": 8.728870967741935e-05, + "loss": 0.2098, + "step": 8382 + }, + { + "epoch": 0.134128, + "grad_norm": 0.890625, + "learning_rate": 8.728709677419355e-05, + "loss": 0.1829, + "step": 8383 + }, + { + "epoch": 0.134144, + "grad_norm": 0.8203125, + "learning_rate": 8.728548387096775e-05, + "loss": 0.1533, + "step": 8384 + }, + { + "epoch": 0.13416, + "grad_norm": 0.734375, + "learning_rate": 8.728387096774194e-05, + "loss": 0.1936, + "step": 8385 + }, + { + "epoch": 0.134176, + "grad_norm": 0.7421875, + "learning_rate": 8.728225806451614e-05, + "loss": 0.1638, + "step": 8386 + }, + { + "epoch": 0.134192, + "grad_norm": 0.6171875, + "learning_rate": 8.728064516129034e-05, + "loss": 0.1573, + "step": 8387 + }, + { + "epoch": 0.134208, + "grad_norm": 0.7109375, + "learning_rate": 8.727903225806452e-05, + "loss": 0.2008, + "step": 8388 + }, + { + "epoch": 0.134224, + "grad_norm": 1.21875, + "learning_rate": 8.727741935483872e-05, + "loss": 0.217, + "step": 8389 + }, + { + "epoch": 0.13424, + "grad_norm": 0.81640625, + "learning_rate": 8.727580645161291e-05, + "loss": 0.1505, + "step": 8390 + }, + { + "epoch": 0.134256, + "grad_norm": 0.7890625, + "learning_rate": 8.727419354838711e-05, + "loss": 0.1774, + "step": 8391 + }, + { + "epoch": 0.134272, + "grad_norm": 1.0859375, + "learning_rate": 8.727258064516129e-05, + "loss": 0.1793, + "step": 8392 + }, + { + "epoch": 0.134288, + "grad_norm": 1.03125, + "learning_rate": 8.727096774193548e-05, + "loss": 0.2101, + "step": 8393 + }, + { + "epoch": 0.134304, + "grad_norm": 1.046875, + "learning_rate": 8.726935483870968e-05, + "loss": 0.2338, + "step": 8394 + }, + { + "epoch": 0.13432, + "grad_norm": 1.3984375, + "learning_rate": 8.726774193548386e-05, + "loss": 0.165, + "step": 8395 + }, + { + "epoch": 0.134336, + "grad_norm": 0.8828125, + "learning_rate": 8.726612903225806e-05, + "loss": 0.1881, + "step": 8396 + }, + { + "epoch": 0.134352, + "grad_norm": 0.78515625, + "learning_rate": 8.726451612903226e-05, + "loss": 0.1476, + "step": 8397 + }, + { + "epoch": 0.134368, + "grad_norm": 0.50390625, + "learning_rate": 8.726290322580646e-05, + "loss": 0.1398, + "step": 8398 + }, + { + "epoch": 0.134384, + "grad_norm": 0.7890625, + "learning_rate": 8.726129032258065e-05, + "loss": 0.2025, + "step": 8399 + }, + { + "epoch": 0.1344, + "grad_norm": 1.0, + "learning_rate": 8.725967741935485e-05, + "loss": 0.1639, + "step": 8400 + }, + { + "epoch": 0.134416, + "grad_norm": 0.5625, + "learning_rate": 8.725806451612904e-05, + "loss": 0.1466, + "step": 8401 + }, + { + "epoch": 0.134432, + "grad_norm": 0.67578125, + "learning_rate": 8.725645161290324e-05, + "loss": 0.1807, + "step": 8402 + }, + { + "epoch": 0.134448, + "grad_norm": 1.2421875, + "learning_rate": 8.725483870967742e-05, + "loss": 0.2408, + "step": 8403 + }, + { + "epoch": 0.134464, + "grad_norm": 1.109375, + "learning_rate": 8.725322580645162e-05, + "loss": 0.1691, + "step": 8404 + }, + { + "epoch": 0.13448, + "grad_norm": 1.1796875, + "learning_rate": 8.725161290322581e-05, + "loss": 0.1733, + "step": 8405 + }, + { + "epoch": 0.134496, + "grad_norm": 1.375, + "learning_rate": 8.725e-05, + "loss": 0.138, + "step": 8406 + }, + { + "epoch": 0.134512, + "grad_norm": 1.3828125, + "learning_rate": 8.724838709677419e-05, + "loss": 0.2173, + "step": 8407 + }, + { + "epoch": 0.134528, + "grad_norm": 0.6640625, + "learning_rate": 8.724677419354839e-05, + "loss": 0.1856, + "step": 8408 + }, + { + "epoch": 0.134544, + "grad_norm": 0.9453125, + "learning_rate": 8.724516129032259e-05, + "loss": 0.1683, + "step": 8409 + }, + { + "epoch": 0.13456, + "grad_norm": 0.77734375, + "learning_rate": 8.724354838709678e-05, + "loss": 0.176, + "step": 8410 + }, + { + "epoch": 0.134576, + "grad_norm": 0.74609375, + "learning_rate": 8.724193548387098e-05, + "loss": 0.1888, + "step": 8411 + }, + { + "epoch": 0.134592, + "grad_norm": 0.890625, + "learning_rate": 8.724032258064516e-05, + "loss": 0.1995, + "step": 8412 + }, + { + "epoch": 0.134608, + "grad_norm": 0.51953125, + "learning_rate": 8.723870967741936e-05, + "loss": 0.1232, + "step": 8413 + }, + { + "epoch": 0.134624, + "grad_norm": 1.34375, + "learning_rate": 8.723709677419355e-05, + "loss": 0.176, + "step": 8414 + }, + { + "epoch": 0.13464, + "grad_norm": 0.64453125, + "learning_rate": 8.723548387096775e-05, + "loss": 0.1755, + "step": 8415 + }, + { + "epoch": 0.134656, + "grad_norm": 0.8046875, + "learning_rate": 8.723387096774194e-05, + "loss": 0.1641, + "step": 8416 + }, + { + "epoch": 0.134672, + "grad_norm": 0.98828125, + "learning_rate": 8.723225806451613e-05, + "loss": 0.1786, + "step": 8417 + }, + { + "epoch": 0.134688, + "grad_norm": 0.75, + "learning_rate": 8.723064516129032e-05, + "loss": 0.1801, + "step": 8418 + }, + { + "epoch": 0.134704, + "grad_norm": 1.03125, + "learning_rate": 8.722903225806452e-05, + "loss": 0.1813, + "step": 8419 + }, + { + "epoch": 0.13472, + "grad_norm": 0.68359375, + "learning_rate": 8.72274193548387e-05, + "loss": 0.1978, + "step": 8420 + }, + { + "epoch": 0.134736, + "grad_norm": 1.0859375, + "learning_rate": 8.72258064516129e-05, + "loss": 0.2548, + "step": 8421 + }, + { + "epoch": 0.134752, + "grad_norm": 0.80859375, + "learning_rate": 8.72241935483871e-05, + "loss": 0.197, + "step": 8422 + }, + { + "epoch": 0.134768, + "grad_norm": 0.78515625, + "learning_rate": 8.722258064516129e-05, + "loss": 0.1482, + "step": 8423 + }, + { + "epoch": 0.134784, + "grad_norm": 0.8046875, + "learning_rate": 8.722096774193549e-05, + "loss": 0.1535, + "step": 8424 + }, + { + "epoch": 0.1348, + "grad_norm": 0.8046875, + "learning_rate": 8.721935483870968e-05, + "loss": 0.1667, + "step": 8425 + }, + { + "epoch": 0.134816, + "grad_norm": 1.0390625, + "learning_rate": 8.721774193548388e-05, + "loss": 0.139, + "step": 8426 + }, + { + "epoch": 0.134832, + "grad_norm": 1.0390625, + "learning_rate": 8.721612903225806e-05, + "loss": 0.1912, + "step": 8427 + }, + { + "epoch": 0.134848, + "grad_norm": 1.0625, + "learning_rate": 8.721451612903226e-05, + "loss": 0.1537, + "step": 8428 + }, + { + "epoch": 0.134864, + "grad_norm": 0.9609375, + "learning_rate": 8.721290322580645e-05, + "loss": 0.163, + "step": 8429 + }, + { + "epoch": 0.13488, + "grad_norm": 0.890625, + "learning_rate": 8.721129032258065e-05, + "loss": 0.2029, + "step": 8430 + }, + { + "epoch": 0.134896, + "grad_norm": 0.447265625, + "learning_rate": 8.720967741935483e-05, + "loss": 0.1482, + "step": 8431 + }, + { + "epoch": 0.134912, + "grad_norm": 0.9609375, + "learning_rate": 8.720806451612903e-05, + "loss": 0.2038, + "step": 8432 + }, + { + "epoch": 0.134928, + "grad_norm": 0.578125, + "learning_rate": 8.720645161290323e-05, + "loss": 0.1368, + "step": 8433 + }, + { + "epoch": 0.134944, + "grad_norm": 0.984375, + "learning_rate": 8.720483870967743e-05, + "loss": 0.1958, + "step": 8434 + }, + { + "epoch": 0.13496, + "grad_norm": 0.77734375, + "learning_rate": 8.720322580645162e-05, + "loss": 0.2156, + "step": 8435 + }, + { + "epoch": 0.134976, + "grad_norm": 0.97265625, + "learning_rate": 8.720161290322582e-05, + "loss": 0.1864, + "step": 8436 + }, + { + "epoch": 0.134992, + "grad_norm": 0.5546875, + "learning_rate": 8.72e-05, + "loss": 0.1651, + "step": 8437 + }, + { + "epoch": 0.135008, + "grad_norm": 0.75390625, + "learning_rate": 8.719838709677419e-05, + "loss": 0.1907, + "step": 8438 + }, + { + "epoch": 0.135024, + "grad_norm": 0.91796875, + "learning_rate": 8.719677419354839e-05, + "loss": 0.2103, + "step": 8439 + }, + { + "epoch": 0.13504, + "grad_norm": 0.72265625, + "learning_rate": 8.719516129032258e-05, + "loss": 0.2096, + "step": 8440 + }, + { + "epoch": 0.135056, + "grad_norm": 0.94921875, + "learning_rate": 8.719354838709678e-05, + "loss": 0.1856, + "step": 8441 + }, + { + "epoch": 0.135072, + "grad_norm": 0.8203125, + "learning_rate": 8.719193548387096e-05, + "loss": 0.1855, + "step": 8442 + }, + { + "epoch": 0.135088, + "grad_norm": 0.796875, + "learning_rate": 8.719032258064516e-05, + "loss": 0.164, + "step": 8443 + }, + { + "epoch": 0.135104, + "grad_norm": 0.7734375, + "learning_rate": 8.718870967741936e-05, + "loss": 0.1321, + "step": 8444 + }, + { + "epoch": 0.13512, + "grad_norm": 0.71875, + "learning_rate": 8.718709677419356e-05, + "loss": 0.1569, + "step": 8445 + }, + { + "epoch": 0.135136, + "grad_norm": 1.0390625, + "learning_rate": 8.718548387096775e-05, + "loss": 0.1452, + "step": 8446 + }, + { + "epoch": 0.135152, + "grad_norm": 0.6796875, + "learning_rate": 8.718387096774195e-05, + "loss": 0.1724, + "step": 8447 + }, + { + "epoch": 0.135168, + "grad_norm": 0.78125, + "learning_rate": 8.718225806451613e-05, + "loss": 0.1672, + "step": 8448 + }, + { + "epoch": 0.135184, + "grad_norm": 0.7109375, + "learning_rate": 8.718064516129033e-05, + "loss": 0.1491, + "step": 8449 + }, + { + "epoch": 0.1352, + "grad_norm": 0.6171875, + "learning_rate": 8.717903225806452e-05, + "loss": 0.1695, + "step": 8450 + }, + { + "epoch": 0.135216, + "grad_norm": 0.73828125, + "learning_rate": 8.717741935483872e-05, + "loss": 0.1588, + "step": 8451 + }, + { + "epoch": 0.135232, + "grad_norm": 0.94140625, + "learning_rate": 8.71758064516129e-05, + "loss": 0.1741, + "step": 8452 + }, + { + "epoch": 0.135248, + "grad_norm": 0.71484375, + "learning_rate": 8.71741935483871e-05, + "loss": 0.1207, + "step": 8453 + }, + { + "epoch": 0.135264, + "grad_norm": 0.765625, + "learning_rate": 8.717258064516129e-05, + "loss": 0.1599, + "step": 8454 + }, + { + "epoch": 0.13528, + "grad_norm": 0.859375, + "learning_rate": 8.717096774193548e-05, + "loss": 0.1489, + "step": 8455 + }, + { + "epoch": 0.135296, + "grad_norm": 1.109375, + "learning_rate": 8.716935483870968e-05, + "loss": 0.1952, + "step": 8456 + }, + { + "epoch": 0.135312, + "grad_norm": 0.6484375, + "learning_rate": 8.716774193548388e-05, + "loss": 0.1412, + "step": 8457 + }, + { + "epoch": 0.135328, + "grad_norm": 0.81640625, + "learning_rate": 8.716612903225808e-05, + "loss": 0.165, + "step": 8458 + }, + { + "epoch": 0.135344, + "grad_norm": 0.91015625, + "learning_rate": 8.716451612903226e-05, + "loss": 0.1987, + "step": 8459 + }, + { + "epoch": 0.13536, + "grad_norm": 0.91015625, + "learning_rate": 8.716290322580646e-05, + "loss": 0.1725, + "step": 8460 + }, + { + "epoch": 0.135376, + "grad_norm": 1.1328125, + "learning_rate": 8.716129032258065e-05, + "loss": 0.2352, + "step": 8461 + }, + { + "epoch": 0.135392, + "grad_norm": 0.94140625, + "learning_rate": 8.715967741935485e-05, + "loss": 0.1905, + "step": 8462 + }, + { + "epoch": 0.135408, + "grad_norm": 0.62109375, + "learning_rate": 8.715806451612903e-05, + "loss": 0.1735, + "step": 8463 + }, + { + "epoch": 0.135424, + "grad_norm": 0.61328125, + "learning_rate": 8.715645161290323e-05, + "loss": 0.1667, + "step": 8464 + }, + { + "epoch": 0.13544, + "grad_norm": 0.76171875, + "learning_rate": 8.715483870967742e-05, + "loss": 0.1364, + "step": 8465 + }, + { + "epoch": 0.135456, + "grad_norm": 0.890625, + "learning_rate": 8.715322580645162e-05, + "loss": 0.1797, + "step": 8466 + }, + { + "epoch": 0.135472, + "grad_norm": 0.8515625, + "learning_rate": 8.71516129032258e-05, + "loss": 0.1572, + "step": 8467 + }, + { + "epoch": 0.135488, + "grad_norm": 0.85546875, + "learning_rate": 8.715e-05, + "loss": 0.1822, + "step": 8468 + }, + { + "epoch": 0.135504, + "grad_norm": 0.74609375, + "learning_rate": 8.71483870967742e-05, + "loss": 0.1469, + "step": 8469 + }, + { + "epoch": 0.13552, + "grad_norm": 1.28125, + "learning_rate": 8.714677419354839e-05, + "loss": 0.1773, + "step": 8470 + }, + { + "epoch": 0.135536, + "grad_norm": 0.671875, + "learning_rate": 8.714516129032259e-05, + "loss": 0.1546, + "step": 8471 + }, + { + "epoch": 0.135552, + "grad_norm": 0.83984375, + "learning_rate": 8.714354838709678e-05, + "loss": 0.1653, + "step": 8472 + }, + { + "epoch": 0.135568, + "grad_norm": 0.61328125, + "learning_rate": 8.714193548387098e-05, + "loss": 0.1839, + "step": 8473 + }, + { + "epoch": 0.135584, + "grad_norm": 0.87890625, + "learning_rate": 8.714032258064516e-05, + "loss": 0.2019, + "step": 8474 + }, + { + "epoch": 0.1356, + "grad_norm": 0.74609375, + "learning_rate": 8.713870967741936e-05, + "loss": 0.1738, + "step": 8475 + }, + { + "epoch": 0.135616, + "grad_norm": 1.1015625, + "learning_rate": 8.713709677419355e-05, + "loss": 0.17, + "step": 8476 + }, + { + "epoch": 0.135632, + "grad_norm": 1.03125, + "learning_rate": 8.713548387096775e-05, + "loss": 0.1729, + "step": 8477 + }, + { + "epoch": 0.135648, + "grad_norm": 0.6484375, + "learning_rate": 8.713387096774193e-05, + "loss": 0.1862, + "step": 8478 + }, + { + "epoch": 0.135664, + "grad_norm": 1.25, + "learning_rate": 8.713225806451613e-05, + "loss": 0.1715, + "step": 8479 + }, + { + "epoch": 0.13568, + "grad_norm": 0.90234375, + "learning_rate": 8.713064516129033e-05, + "loss": 0.202, + "step": 8480 + }, + { + "epoch": 0.135696, + "grad_norm": 0.94140625, + "learning_rate": 8.712903225806452e-05, + "loss": 0.1698, + "step": 8481 + }, + { + "epoch": 0.135712, + "grad_norm": 0.83984375, + "learning_rate": 8.712741935483872e-05, + "loss": 0.1745, + "step": 8482 + }, + { + "epoch": 0.135728, + "grad_norm": 0.70703125, + "learning_rate": 8.712580645161292e-05, + "loss": 0.1683, + "step": 8483 + }, + { + "epoch": 0.135744, + "grad_norm": 0.73828125, + "learning_rate": 8.71241935483871e-05, + "loss": 0.1731, + "step": 8484 + }, + { + "epoch": 0.13576, + "grad_norm": 0.73828125, + "learning_rate": 8.712258064516129e-05, + "loss": 0.1523, + "step": 8485 + }, + { + "epoch": 0.135776, + "grad_norm": 1.4765625, + "learning_rate": 8.712096774193549e-05, + "loss": 0.2475, + "step": 8486 + }, + { + "epoch": 0.135792, + "grad_norm": 0.99609375, + "learning_rate": 8.711935483870968e-05, + "loss": 0.1915, + "step": 8487 + }, + { + "epoch": 0.135808, + "grad_norm": 1.1171875, + "learning_rate": 8.711774193548387e-05, + "loss": 0.2, + "step": 8488 + }, + { + "epoch": 0.135824, + "grad_norm": 0.7578125, + "learning_rate": 8.711612903225806e-05, + "loss": 0.2043, + "step": 8489 + }, + { + "epoch": 0.13584, + "grad_norm": 1.0234375, + "learning_rate": 8.711451612903226e-05, + "loss": 0.2108, + "step": 8490 + }, + { + "epoch": 0.135856, + "grad_norm": 0.80859375, + "learning_rate": 8.711290322580645e-05, + "loss": 0.1563, + "step": 8491 + }, + { + "epoch": 0.135872, + "grad_norm": 0.66796875, + "learning_rate": 8.711129032258065e-05, + "loss": 0.1549, + "step": 8492 + }, + { + "epoch": 0.135888, + "grad_norm": 0.7890625, + "learning_rate": 8.710967741935485e-05, + "loss": 0.1994, + "step": 8493 + }, + { + "epoch": 0.135904, + "grad_norm": 0.68359375, + "learning_rate": 8.710806451612905e-05, + "loss": 0.1597, + "step": 8494 + }, + { + "epoch": 0.13592, + "grad_norm": 0.66015625, + "learning_rate": 8.710645161290323e-05, + "loss": 0.1637, + "step": 8495 + }, + { + "epoch": 0.135936, + "grad_norm": 0.77734375, + "learning_rate": 8.710483870967743e-05, + "loss": 0.1582, + "step": 8496 + }, + { + "epoch": 0.135952, + "grad_norm": 0.65625, + "learning_rate": 8.710322580645162e-05, + "loss": 0.1577, + "step": 8497 + }, + { + "epoch": 0.135968, + "grad_norm": 0.70703125, + "learning_rate": 8.710161290322582e-05, + "loss": 0.161, + "step": 8498 + }, + { + "epoch": 0.135984, + "grad_norm": 0.91015625, + "learning_rate": 8.71e-05, + "loss": 0.1663, + "step": 8499 + }, + { + "epoch": 0.136, + "grad_norm": 0.59375, + "learning_rate": 8.709838709677419e-05, + "loss": 0.1792, + "step": 8500 + }, + { + "epoch": 0.136016, + "grad_norm": 0.93359375, + "learning_rate": 8.709677419354839e-05, + "loss": 0.1762, + "step": 8501 + }, + { + "epoch": 0.136032, + "grad_norm": 0.59765625, + "learning_rate": 8.709516129032257e-05, + "loss": 0.1848, + "step": 8502 + }, + { + "epoch": 0.136048, + "grad_norm": 0.62890625, + "learning_rate": 8.709354838709677e-05, + "loss": 0.1749, + "step": 8503 + }, + { + "epoch": 0.136064, + "grad_norm": 0.66796875, + "learning_rate": 8.709193548387097e-05, + "loss": 0.215, + "step": 8504 + }, + { + "epoch": 0.13608, + "grad_norm": 0.9765625, + "learning_rate": 8.709032258064517e-05, + "loss": 0.1746, + "step": 8505 + }, + { + "epoch": 0.136096, + "grad_norm": 0.90625, + "learning_rate": 8.708870967741936e-05, + "loss": 0.1973, + "step": 8506 + }, + { + "epoch": 0.136112, + "grad_norm": 0.87109375, + "learning_rate": 8.708709677419356e-05, + "loss": 0.1573, + "step": 8507 + }, + { + "epoch": 0.136128, + "grad_norm": 0.87109375, + "learning_rate": 8.708548387096775e-05, + "loss": 0.1606, + "step": 8508 + }, + { + "epoch": 0.136144, + "grad_norm": 1.4296875, + "learning_rate": 8.708387096774195e-05, + "loss": 0.202, + "step": 8509 + }, + { + "epoch": 0.13616, + "grad_norm": 1.140625, + "learning_rate": 8.708225806451613e-05, + "loss": 0.1737, + "step": 8510 + }, + { + "epoch": 0.136176, + "grad_norm": 0.56640625, + "learning_rate": 8.708064516129033e-05, + "loss": 0.1789, + "step": 8511 + }, + { + "epoch": 0.136192, + "grad_norm": 0.8515625, + "learning_rate": 8.707903225806452e-05, + "loss": 0.1485, + "step": 8512 + }, + { + "epoch": 0.136208, + "grad_norm": 0.921875, + "learning_rate": 8.707741935483872e-05, + "loss": 0.2047, + "step": 8513 + }, + { + "epoch": 0.136224, + "grad_norm": 0.953125, + "learning_rate": 8.70758064516129e-05, + "loss": 0.1745, + "step": 8514 + }, + { + "epoch": 0.13624, + "grad_norm": 2.625, + "learning_rate": 8.70741935483871e-05, + "loss": 0.2464, + "step": 8515 + }, + { + "epoch": 0.136256, + "grad_norm": 0.5703125, + "learning_rate": 8.707258064516129e-05, + "loss": 0.1538, + "step": 8516 + }, + { + "epoch": 0.136272, + "grad_norm": 0.7890625, + "learning_rate": 8.707096774193549e-05, + "loss": 0.1457, + "step": 8517 + }, + { + "epoch": 0.136288, + "grad_norm": 0.7734375, + "learning_rate": 8.706935483870969e-05, + "loss": 0.1759, + "step": 8518 + }, + { + "epoch": 0.136304, + "grad_norm": 0.84765625, + "learning_rate": 8.706774193548387e-05, + "loss": 0.166, + "step": 8519 + }, + { + "epoch": 0.13632, + "grad_norm": 0.796875, + "learning_rate": 8.706612903225807e-05, + "loss": 0.1952, + "step": 8520 + }, + { + "epoch": 0.136336, + "grad_norm": 0.78125, + "learning_rate": 8.706451612903226e-05, + "loss": 0.2024, + "step": 8521 + }, + { + "epoch": 0.136352, + "grad_norm": 1.125, + "learning_rate": 8.706290322580646e-05, + "loss": 0.1884, + "step": 8522 + }, + { + "epoch": 0.136368, + "grad_norm": 1.046875, + "learning_rate": 8.706129032258064e-05, + "loss": 0.2081, + "step": 8523 + }, + { + "epoch": 0.136384, + "grad_norm": 1.4375, + "learning_rate": 8.705967741935484e-05, + "loss": 0.2138, + "step": 8524 + }, + { + "epoch": 0.1364, + "grad_norm": 0.56640625, + "learning_rate": 8.705806451612903e-05, + "loss": 0.1563, + "step": 8525 + }, + { + "epoch": 0.136416, + "grad_norm": 1.3203125, + "learning_rate": 8.705645161290323e-05, + "loss": 0.1661, + "step": 8526 + }, + { + "epoch": 0.136432, + "grad_norm": 0.625, + "learning_rate": 8.705483870967742e-05, + "loss": 0.1866, + "step": 8527 + }, + { + "epoch": 0.136448, + "grad_norm": 0.671875, + "learning_rate": 8.705322580645162e-05, + "loss": 0.1825, + "step": 8528 + }, + { + "epoch": 0.136464, + "grad_norm": 0.75, + "learning_rate": 8.705161290322582e-05, + "loss": 0.1597, + "step": 8529 + }, + { + "epoch": 0.13648, + "grad_norm": 0.99609375, + "learning_rate": 8.705000000000002e-05, + "loss": 0.1596, + "step": 8530 + }, + { + "epoch": 0.136496, + "grad_norm": 0.68359375, + "learning_rate": 8.70483870967742e-05, + "loss": 0.1835, + "step": 8531 + }, + { + "epoch": 0.136512, + "grad_norm": 0.69921875, + "learning_rate": 8.704677419354839e-05, + "loss": 0.1691, + "step": 8532 + }, + { + "epoch": 0.136528, + "grad_norm": 0.55859375, + "learning_rate": 8.704516129032259e-05, + "loss": 0.1688, + "step": 8533 + }, + { + "epoch": 0.136544, + "grad_norm": 0.6171875, + "learning_rate": 8.704354838709677e-05, + "loss": 0.1784, + "step": 8534 + }, + { + "epoch": 0.13656, + "grad_norm": 0.53125, + "learning_rate": 8.704193548387097e-05, + "loss": 0.156, + "step": 8535 + }, + { + "epoch": 0.136576, + "grad_norm": 0.625, + "learning_rate": 8.704032258064516e-05, + "loss": 0.1659, + "step": 8536 + }, + { + "epoch": 0.136592, + "grad_norm": 0.78515625, + "learning_rate": 8.703870967741936e-05, + "loss": 0.1718, + "step": 8537 + }, + { + "epoch": 0.136608, + "grad_norm": 0.74609375, + "learning_rate": 8.703709677419354e-05, + "loss": 0.1831, + "step": 8538 + }, + { + "epoch": 0.136624, + "grad_norm": 0.703125, + "learning_rate": 8.703548387096774e-05, + "loss": 0.1601, + "step": 8539 + }, + { + "epoch": 0.13664, + "grad_norm": 1.046875, + "learning_rate": 8.703387096774194e-05, + "loss": 0.1773, + "step": 8540 + }, + { + "epoch": 0.136656, + "grad_norm": 1.0703125, + "learning_rate": 8.703225806451614e-05, + "loss": 0.1595, + "step": 8541 + }, + { + "epoch": 0.136672, + "grad_norm": 0.76953125, + "learning_rate": 8.703064516129033e-05, + "loss": 0.1435, + "step": 8542 + }, + { + "epoch": 0.136688, + "grad_norm": 0.86328125, + "learning_rate": 8.702903225806453e-05, + "loss": 0.1622, + "step": 8543 + }, + { + "epoch": 0.136704, + "grad_norm": 0.578125, + "learning_rate": 8.702741935483872e-05, + "loss": 0.187, + "step": 8544 + }, + { + "epoch": 0.13672, + "grad_norm": 1.1796875, + "learning_rate": 8.702580645161291e-05, + "loss": 0.1431, + "step": 8545 + }, + { + "epoch": 0.136736, + "grad_norm": 0.8515625, + "learning_rate": 8.70241935483871e-05, + "loss": 0.1894, + "step": 8546 + }, + { + "epoch": 0.136752, + "grad_norm": 1.2421875, + "learning_rate": 8.702258064516129e-05, + "loss": 0.2029, + "step": 8547 + }, + { + "epoch": 0.136768, + "grad_norm": 0.71875, + "learning_rate": 8.702096774193549e-05, + "loss": 0.123, + "step": 8548 + }, + { + "epoch": 0.136784, + "grad_norm": 0.9609375, + "learning_rate": 8.701935483870967e-05, + "loss": 0.1913, + "step": 8549 + }, + { + "epoch": 0.1368, + "grad_norm": 0.48046875, + "learning_rate": 8.701774193548387e-05, + "loss": 0.1316, + "step": 8550 + }, + { + "epoch": 0.136816, + "grad_norm": 0.66015625, + "learning_rate": 8.701612903225806e-05, + "loss": 0.15, + "step": 8551 + }, + { + "epoch": 0.136832, + "grad_norm": 1.640625, + "learning_rate": 8.701451612903226e-05, + "loss": 0.1938, + "step": 8552 + }, + { + "epoch": 0.136848, + "grad_norm": 0.921875, + "learning_rate": 8.701290322580646e-05, + "loss": 0.1856, + "step": 8553 + }, + { + "epoch": 0.136864, + "grad_norm": 1.125, + "learning_rate": 8.701129032258066e-05, + "loss": 0.1752, + "step": 8554 + }, + { + "epoch": 0.13688, + "grad_norm": 0.5078125, + "learning_rate": 8.700967741935484e-05, + "loss": 0.1568, + "step": 8555 + }, + { + "epoch": 0.136896, + "grad_norm": 0.73046875, + "learning_rate": 8.700806451612904e-05, + "loss": 0.1652, + "step": 8556 + }, + { + "epoch": 0.136912, + "grad_norm": 0.59765625, + "learning_rate": 8.700645161290323e-05, + "loss": 0.1809, + "step": 8557 + }, + { + "epoch": 0.136928, + "grad_norm": 0.95703125, + "learning_rate": 8.700483870967743e-05, + "loss": 0.1769, + "step": 8558 + }, + { + "epoch": 0.136944, + "grad_norm": 0.88671875, + "learning_rate": 8.700322580645161e-05, + "loss": 0.1936, + "step": 8559 + }, + { + "epoch": 0.13696, + "grad_norm": 1.1328125, + "learning_rate": 8.700161290322581e-05, + "loss": 0.2299, + "step": 8560 + }, + { + "epoch": 0.136976, + "grad_norm": 0.85546875, + "learning_rate": 8.7e-05, + "loss": 0.1778, + "step": 8561 + }, + { + "epoch": 0.136992, + "grad_norm": 1.3125, + "learning_rate": 8.69983870967742e-05, + "loss": 0.1994, + "step": 8562 + }, + { + "epoch": 0.137008, + "grad_norm": 0.578125, + "learning_rate": 8.699677419354839e-05, + "loss": 0.1672, + "step": 8563 + }, + { + "epoch": 0.137024, + "grad_norm": 0.86328125, + "learning_rate": 8.699516129032259e-05, + "loss": 0.2266, + "step": 8564 + }, + { + "epoch": 0.13704, + "grad_norm": 0.5546875, + "learning_rate": 8.699354838709679e-05, + "loss": 0.1584, + "step": 8565 + }, + { + "epoch": 0.137056, + "grad_norm": 0.734375, + "learning_rate": 8.699193548387097e-05, + "loss": 0.1523, + "step": 8566 + }, + { + "epoch": 0.137072, + "grad_norm": 0.77734375, + "learning_rate": 8.699032258064517e-05, + "loss": 0.1954, + "step": 8567 + }, + { + "epoch": 0.137088, + "grad_norm": 0.8125, + "learning_rate": 8.698870967741936e-05, + "loss": 0.2008, + "step": 8568 + }, + { + "epoch": 0.137104, + "grad_norm": 0.5703125, + "learning_rate": 8.698709677419356e-05, + "loss": 0.1464, + "step": 8569 + }, + { + "epoch": 0.13712, + "grad_norm": 0.5859375, + "learning_rate": 8.698548387096774e-05, + "loss": 0.1553, + "step": 8570 + }, + { + "epoch": 0.137136, + "grad_norm": 0.5546875, + "learning_rate": 8.698387096774194e-05, + "loss": 0.1572, + "step": 8571 + }, + { + "epoch": 0.137152, + "grad_norm": 0.66015625, + "learning_rate": 8.698225806451613e-05, + "loss": 0.1563, + "step": 8572 + }, + { + "epoch": 0.137168, + "grad_norm": 0.9921875, + "learning_rate": 8.698064516129033e-05, + "loss": 0.1756, + "step": 8573 + }, + { + "epoch": 0.137184, + "grad_norm": 1.234375, + "learning_rate": 8.697903225806451e-05, + "loss": 0.2124, + "step": 8574 + }, + { + "epoch": 0.1372, + "grad_norm": 0.90234375, + "learning_rate": 8.697741935483871e-05, + "loss": 0.1709, + "step": 8575 + }, + { + "epoch": 0.137216, + "grad_norm": 1.328125, + "learning_rate": 8.69758064516129e-05, + "loss": 0.1777, + "step": 8576 + }, + { + "epoch": 0.137232, + "grad_norm": 0.6875, + "learning_rate": 8.69741935483871e-05, + "loss": 0.154, + "step": 8577 + }, + { + "epoch": 0.137248, + "grad_norm": 0.703125, + "learning_rate": 8.69725806451613e-05, + "loss": 0.1861, + "step": 8578 + }, + { + "epoch": 0.137264, + "grad_norm": 0.7421875, + "learning_rate": 8.697096774193549e-05, + "loss": 0.1829, + "step": 8579 + }, + { + "epoch": 0.13728, + "grad_norm": 0.6171875, + "learning_rate": 8.696935483870969e-05, + "loss": 0.1468, + "step": 8580 + }, + { + "epoch": 0.137296, + "grad_norm": 0.64453125, + "learning_rate": 8.696774193548387e-05, + "loss": 0.1994, + "step": 8581 + }, + { + "epoch": 0.137312, + "grad_norm": 1.1328125, + "learning_rate": 8.696612903225807e-05, + "loss": 0.1923, + "step": 8582 + }, + { + "epoch": 0.137328, + "grad_norm": 0.7421875, + "learning_rate": 8.696451612903226e-05, + "loss": 0.1793, + "step": 8583 + }, + { + "epoch": 0.137344, + "grad_norm": 1.1171875, + "learning_rate": 8.696290322580646e-05, + "loss": 0.1702, + "step": 8584 + }, + { + "epoch": 0.13736, + "grad_norm": 1.140625, + "learning_rate": 8.696129032258064e-05, + "loss": 0.1617, + "step": 8585 + }, + { + "epoch": 0.137376, + "grad_norm": 0.427734375, + "learning_rate": 8.695967741935484e-05, + "loss": 0.1439, + "step": 8586 + }, + { + "epoch": 0.137392, + "grad_norm": 0.74609375, + "learning_rate": 8.695806451612903e-05, + "loss": 0.2048, + "step": 8587 + }, + { + "epoch": 0.137408, + "grad_norm": 0.83203125, + "learning_rate": 8.695645161290323e-05, + "loss": 0.1805, + "step": 8588 + }, + { + "epoch": 0.137424, + "grad_norm": 0.8203125, + "learning_rate": 8.695483870967743e-05, + "loss": 0.1615, + "step": 8589 + }, + { + "epoch": 0.13744, + "grad_norm": 1.1953125, + "learning_rate": 8.695322580645163e-05, + "loss": 0.204, + "step": 8590 + }, + { + "epoch": 0.137456, + "grad_norm": 0.78515625, + "learning_rate": 8.695161290322581e-05, + "loss": 0.1456, + "step": 8591 + }, + { + "epoch": 0.137472, + "grad_norm": 0.71875, + "learning_rate": 8.695000000000001e-05, + "loss": 0.1825, + "step": 8592 + }, + { + "epoch": 0.137488, + "grad_norm": 1.1328125, + "learning_rate": 8.69483870967742e-05, + "loss": 0.2245, + "step": 8593 + }, + { + "epoch": 0.137504, + "grad_norm": 0.703125, + "learning_rate": 8.694677419354839e-05, + "loss": 0.1883, + "step": 8594 + }, + { + "epoch": 0.13752, + "grad_norm": 0.60546875, + "learning_rate": 8.694516129032258e-05, + "loss": 0.1654, + "step": 8595 + }, + { + "epoch": 0.137536, + "grad_norm": 0.86328125, + "learning_rate": 8.694354838709677e-05, + "loss": 0.1578, + "step": 8596 + }, + { + "epoch": 0.137552, + "grad_norm": 0.55859375, + "learning_rate": 8.694193548387097e-05, + "loss": 0.1673, + "step": 8597 + }, + { + "epoch": 0.137568, + "grad_norm": 0.8046875, + "learning_rate": 8.694032258064516e-05, + "loss": 0.1641, + "step": 8598 + }, + { + "epoch": 0.137584, + "grad_norm": 0.7734375, + "learning_rate": 8.693870967741936e-05, + "loss": 0.1587, + "step": 8599 + }, + { + "epoch": 0.1376, + "grad_norm": 0.640625, + "learning_rate": 8.693709677419356e-05, + "loss": 0.1696, + "step": 8600 + }, + { + "epoch": 0.137616, + "grad_norm": 0.6953125, + "learning_rate": 8.693548387096776e-05, + "loss": 0.1695, + "step": 8601 + }, + { + "epoch": 0.137632, + "grad_norm": 0.7890625, + "learning_rate": 8.693387096774194e-05, + "loss": 0.1303, + "step": 8602 + }, + { + "epoch": 0.137648, + "grad_norm": 0.79296875, + "learning_rate": 8.693225806451614e-05, + "loss": 0.2117, + "step": 8603 + }, + { + "epoch": 0.137664, + "grad_norm": 0.92578125, + "learning_rate": 8.693064516129033e-05, + "loss": 0.1873, + "step": 8604 + }, + { + "epoch": 0.13768, + "grad_norm": 0.796875, + "learning_rate": 8.692903225806453e-05, + "loss": 0.1689, + "step": 8605 + }, + { + "epoch": 0.137696, + "grad_norm": 0.7265625, + "learning_rate": 8.692741935483871e-05, + "loss": 0.1638, + "step": 8606 + }, + { + "epoch": 0.137712, + "grad_norm": 0.93359375, + "learning_rate": 8.692580645161291e-05, + "loss": 0.1719, + "step": 8607 + }, + { + "epoch": 0.137728, + "grad_norm": 0.796875, + "learning_rate": 8.69241935483871e-05, + "loss": 0.1996, + "step": 8608 + }, + { + "epoch": 0.137744, + "grad_norm": 0.578125, + "learning_rate": 8.692258064516128e-05, + "loss": 0.1709, + "step": 8609 + }, + { + "epoch": 0.13776, + "grad_norm": 0.85546875, + "learning_rate": 8.692096774193548e-05, + "loss": 0.1656, + "step": 8610 + }, + { + "epoch": 0.137776, + "grad_norm": 0.6640625, + "learning_rate": 8.691935483870967e-05, + "loss": 0.1337, + "step": 8611 + }, + { + "epoch": 0.137792, + "grad_norm": 0.78515625, + "learning_rate": 8.691774193548387e-05, + "loss": 0.2081, + "step": 8612 + }, + { + "epoch": 0.137808, + "grad_norm": 0.71484375, + "learning_rate": 8.691612903225807e-05, + "loss": 0.1801, + "step": 8613 + }, + { + "epoch": 0.137824, + "grad_norm": 0.609375, + "learning_rate": 8.691451612903227e-05, + "loss": 0.1559, + "step": 8614 + }, + { + "epoch": 0.13784, + "grad_norm": 0.8203125, + "learning_rate": 8.691290322580646e-05, + "loss": 0.1214, + "step": 8615 + }, + { + "epoch": 0.137856, + "grad_norm": 0.85546875, + "learning_rate": 8.691129032258065e-05, + "loss": 0.1448, + "step": 8616 + }, + { + "epoch": 0.137872, + "grad_norm": 0.84375, + "learning_rate": 8.690967741935484e-05, + "loss": 0.1885, + "step": 8617 + }, + { + "epoch": 0.137888, + "grad_norm": 0.7421875, + "learning_rate": 8.690806451612904e-05, + "loss": 0.1813, + "step": 8618 + }, + { + "epoch": 0.137904, + "grad_norm": 0.68359375, + "learning_rate": 8.690645161290323e-05, + "loss": 0.1352, + "step": 8619 + }, + { + "epoch": 0.13792, + "grad_norm": 0.478515625, + "learning_rate": 8.690483870967743e-05, + "loss": 0.1447, + "step": 8620 + }, + { + "epoch": 0.137936, + "grad_norm": 0.7578125, + "learning_rate": 8.690322580645161e-05, + "loss": 0.1472, + "step": 8621 + }, + { + "epoch": 0.137952, + "grad_norm": 1.078125, + "learning_rate": 8.690161290322581e-05, + "loss": 0.207, + "step": 8622 + }, + { + "epoch": 0.137968, + "grad_norm": 1.2265625, + "learning_rate": 8.69e-05, + "loss": 0.2033, + "step": 8623 + }, + { + "epoch": 0.137984, + "grad_norm": 0.92578125, + "learning_rate": 8.68983870967742e-05, + "loss": 0.1297, + "step": 8624 + }, + { + "epoch": 0.138, + "grad_norm": 0.7890625, + "learning_rate": 8.68967741935484e-05, + "loss": 0.1643, + "step": 8625 + }, + { + "epoch": 0.138016, + "grad_norm": 0.82421875, + "learning_rate": 8.689516129032258e-05, + "loss": 0.1735, + "step": 8626 + }, + { + "epoch": 0.138032, + "grad_norm": 1.125, + "learning_rate": 8.689354838709678e-05, + "loss": 0.1861, + "step": 8627 + }, + { + "epoch": 0.138048, + "grad_norm": 1.1640625, + "learning_rate": 8.689193548387097e-05, + "loss": 0.1997, + "step": 8628 + }, + { + "epoch": 0.138064, + "grad_norm": 0.8046875, + "learning_rate": 8.689032258064517e-05, + "loss": 0.1388, + "step": 8629 + }, + { + "epoch": 0.13808, + "grad_norm": 0.88671875, + "learning_rate": 8.688870967741935e-05, + "loss": 0.1469, + "step": 8630 + }, + { + "epoch": 0.138096, + "grad_norm": 1.1953125, + "learning_rate": 8.688709677419355e-05, + "loss": 0.1726, + "step": 8631 + }, + { + "epoch": 0.138112, + "grad_norm": 0.74609375, + "learning_rate": 8.688548387096774e-05, + "loss": 0.1959, + "step": 8632 + }, + { + "epoch": 0.138128, + "grad_norm": 1.296875, + "learning_rate": 8.688387096774194e-05, + "loss": 0.2022, + "step": 8633 + }, + { + "epoch": 0.138144, + "grad_norm": 0.6484375, + "learning_rate": 8.688225806451613e-05, + "loss": 0.1612, + "step": 8634 + }, + { + "epoch": 0.13816, + "grad_norm": 0.61328125, + "learning_rate": 8.688064516129033e-05, + "loss": 0.1574, + "step": 8635 + }, + { + "epoch": 0.138176, + "grad_norm": 0.56640625, + "learning_rate": 8.687903225806453e-05, + "loss": 0.1646, + "step": 8636 + }, + { + "epoch": 0.138192, + "grad_norm": 0.53515625, + "learning_rate": 8.687741935483871e-05, + "loss": 0.1367, + "step": 8637 + }, + { + "epoch": 0.138208, + "grad_norm": 0.69140625, + "learning_rate": 8.687580645161291e-05, + "loss": 0.1693, + "step": 8638 + }, + { + "epoch": 0.138224, + "grad_norm": 0.60546875, + "learning_rate": 8.687419354838711e-05, + "loss": 0.1449, + "step": 8639 + }, + { + "epoch": 0.13824, + "grad_norm": 0.92578125, + "learning_rate": 8.68725806451613e-05, + "loss": 0.1701, + "step": 8640 + }, + { + "epoch": 0.138256, + "grad_norm": 0.73046875, + "learning_rate": 8.687096774193548e-05, + "loss": 0.1751, + "step": 8641 + }, + { + "epoch": 0.138272, + "grad_norm": 1.4375, + "learning_rate": 8.686935483870968e-05, + "loss": 0.2088, + "step": 8642 + }, + { + "epoch": 0.138288, + "grad_norm": 0.9765625, + "learning_rate": 8.686774193548387e-05, + "loss": 0.1875, + "step": 8643 + }, + { + "epoch": 0.138304, + "grad_norm": 1.328125, + "learning_rate": 8.686612903225807e-05, + "loss": 0.1897, + "step": 8644 + }, + { + "epoch": 0.13832, + "grad_norm": 0.69921875, + "learning_rate": 8.686451612903225e-05, + "loss": 0.1694, + "step": 8645 + }, + { + "epoch": 0.138336, + "grad_norm": 0.6015625, + "learning_rate": 8.686290322580645e-05, + "loss": 0.1642, + "step": 8646 + }, + { + "epoch": 0.138352, + "grad_norm": 0.5625, + "learning_rate": 8.686129032258064e-05, + "loss": 0.1469, + "step": 8647 + }, + { + "epoch": 0.138368, + "grad_norm": 0.609375, + "learning_rate": 8.685967741935484e-05, + "loss": 0.1411, + "step": 8648 + }, + { + "epoch": 0.138384, + "grad_norm": 0.625, + "learning_rate": 8.685806451612904e-05, + "loss": 0.1763, + "step": 8649 + }, + { + "epoch": 0.1384, + "grad_norm": 0.64453125, + "learning_rate": 8.685645161290324e-05, + "loss": 0.163, + "step": 8650 + }, + { + "epoch": 0.138416, + "grad_norm": 1.59375, + "learning_rate": 8.685483870967743e-05, + "loss": 0.1819, + "step": 8651 + }, + { + "epoch": 0.138432, + "grad_norm": 0.578125, + "learning_rate": 8.685322580645162e-05, + "loss": 0.1754, + "step": 8652 + }, + { + "epoch": 0.138448, + "grad_norm": 0.84375, + "learning_rate": 8.685161290322581e-05, + "loss": 0.1636, + "step": 8653 + }, + { + "epoch": 0.138464, + "grad_norm": 0.6015625, + "learning_rate": 8.685000000000001e-05, + "loss": 0.1368, + "step": 8654 + }, + { + "epoch": 0.13848, + "grad_norm": 0.921875, + "learning_rate": 8.68483870967742e-05, + "loss": 0.1331, + "step": 8655 + }, + { + "epoch": 0.138496, + "grad_norm": 0.8828125, + "learning_rate": 8.684677419354838e-05, + "loss": 0.1649, + "step": 8656 + }, + { + "epoch": 0.138512, + "grad_norm": 0.59375, + "learning_rate": 8.684516129032258e-05, + "loss": 0.1739, + "step": 8657 + }, + { + "epoch": 0.138528, + "grad_norm": 0.9921875, + "learning_rate": 8.684354838709677e-05, + "loss": 0.1589, + "step": 8658 + }, + { + "epoch": 0.138544, + "grad_norm": 0.59375, + "learning_rate": 8.684193548387097e-05, + "loss": 0.1364, + "step": 8659 + }, + { + "epoch": 0.13856, + "grad_norm": 0.69140625, + "learning_rate": 8.684032258064517e-05, + "loss": 0.1661, + "step": 8660 + }, + { + "epoch": 0.138576, + "grad_norm": 0.6328125, + "learning_rate": 8.683870967741937e-05, + "loss": 0.1354, + "step": 8661 + }, + { + "epoch": 0.138592, + "grad_norm": 0.94921875, + "learning_rate": 8.683709677419355e-05, + "loss": 0.1891, + "step": 8662 + }, + { + "epoch": 0.138608, + "grad_norm": 0.8671875, + "learning_rate": 8.683548387096775e-05, + "loss": 0.1817, + "step": 8663 + }, + { + "epoch": 0.138624, + "grad_norm": 0.66015625, + "learning_rate": 8.683387096774194e-05, + "loss": 0.1435, + "step": 8664 + }, + { + "epoch": 0.13864, + "grad_norm": 0.67578125, + "learning_rate": 8.683225806451614e-05, + "loss": 0.152, + "step": 8665 + }, + { + "epoch": 0.138656, + "grad_norm": 0.97265625, + "learning_rate": 8.683064516129032e-05, + "loss": 0.2128, + "step": 8666 + }, + { + "epoch": 0.138672, + "grad_norm": 0.66796875, + "learning_rate": 8.682903225806452e-05, + "loss": 0.1915, + "step": 8667 + }, + { + "epoch": 0.138688, + "grad_norm": 0.7109375, + "learning_rate": 8.682741935483871e-05, + "loss": 0.1652, + "step": 8668 + }, + { + "epoch": 0.138704, + "grad_norm": 0.83984375, + "learning_rate": 8.682580645161291e-05, + "loss": 0.1888, + "step": 8669 + }, + { + "epoch": 0.13872, + "grad_norm": 1.03125, + "learning_rate": 8.68241935483871e-05, + "loss": 0.1493, + "step": 8670 + }, + { + "epoch": 0.138736, + "grad_norm": 0.67578125, + "learning_rate": 8.68225806451613e-05, + "loss": 0.1588, + "step": 8671 + }, + { + "epoch": 0.138752, + "grad_norm": 0.9609375, + "learning_rate": 8.682096774193548e-05, + "loss": 0.2025, + "step": 8672 + }, + { + "epoch": 0.138768, + "grad_norm": 0.9375, + "learning_rate": 8.681935483870968e-05, + "loss": 0.1489, + "step": 8673 + }, + { + "epoch": 0.138784, + "grad_norm": 0.7890625, + "learning_rate": 8.681774193548388e-05, + "loss": 0.1731, + "step": 8674 + }, + { + "epoch": 0.1388, + "grad_norm": 1.0078125, + "learning_rate": 8.681612903225807e-05, + "loss": 0.204, + "step": 8675 + }, + { + "epoch": 0.138816, + "grad_norm": 0.72265625, + "learning_rate": 8.681451612903227e-05, + "loss": 0.157, + "step": 8676 + }, + { + "epoch": 0.138832, + "grad_norm": 1.0859375, + "learning_rate": 8.681290322580645e-05, + "loss": 0.1861, + "step": 8677 + }, + { + "epoch": 0.138848, + "grad_norm": 0.94140625, + "learning_rate": 8.681129032258065e-05, + "loss": 0.1533, + "step": 8678 + }, + { + "epoch": 0.138864, + "grad_norm": 1.1796875, + "learning_rate": 8.680967741935484e-05, + "loss": 0.2042, + "step": 8679 + }, + { + "epoch": 0.13888, + "grad_norm": 0.99609375, + "learning_rate": 8.680806451612904e-05, + "loss": 0.1806, + "step": 8680 + }, + { + "epoch": 0.138896, + "grad_norm": 0.8359375, + "learning_rate": 8.680645161290322e-05, + "loss": 0.1757, + "step": 8681 + }, + { + "epoch": 0.138912, + "grad_norm": 1.578125, + "learning_rate": 8.680483870967742e-05, + "loss": 0.1966, + "step": 8682 + }, + { + "epoch": 0.138928, + "grad_norm": 0.70703125, + "learning_rate": 8.680322580645161e-05, + "loss": 0.14, + "step": 8683 + }, + { + "epoch": 0.138944, + "grad_norm": 1.0078125, + "learning_rate": 8.680161290322581e-05, + "loss": 0.1383, + "step": 8684 + }, + { + "epoch": 0.13896, + "grad_norm": 0.94140625, + "learning_rate": 8.680000000000001e-05, + "loss": 0.2251, + "step": 8685 + }, + { + "epoch": 0.138976, + "grad_norm": 1.078125, + "learning_rate": 8.679838709677421e-05, + "loss": 0.2191, + "step": 8686 + }, + { + "epoch": 0.138992, + "grad_norm": 0.71484375, + "learning_rate": 8.67967741935484e-05, + "loss": 0.1817, + "step": 8687 + }, + { + "epoch": 0.139008, + "grad_norm": 1.5546875, + "learning_rate": 8.679516129032258e-05, + "loss": 0.2383, + "step": 8688 + }, + { + "epoch": 0.139024, + "grad_norm": 1.0859375, + "learning_rate": 8.679354838709678e-05, + "loss": 0.1675, + "step": 8689 + }, + { + "epoch": 0.13904, + "grad_norm": 1.25, + "learning_rate": 8.679193548387097e-05, + "loss": 0.2285, + "step": 8690 + }, + { + "epoch": 0.139056, + "grad_norm": 1.515625, + "learning_rate": 8.679032258064517e-05, + "loss": 0.1908, + "step": 8691 + }, + { + "epoch": 0.139072, + "grad_norm": 0.94140625, + "learning_rate": 8.678870967741935e-05, + "loss": 0.1831, + "step": 8692 + }, + { + "epoch": 0.139088, + "grad_norm": 0.578125, + "learning_rate": 8.678709677419355e-05, + "loss": 0.1865, + "step": 8693 + }, + { + "epoch": 0.139104, + "grad_norm": 0.87890625, + "learning_rate": 8.678548387096774e-05, + "loss": 0.1678, + "step": 8694 + }, + { + "epoch": 0.13912, + "grad_norm": 0.76953125, + "learning_rate": 8.678387096774194e-05, + "loss": 0.1976, + "step": 8695 + }, + { + "epoch": 0.139136, + "grad_norm": 0.6640625, + "learning_rate": 8.678225806451614e-05, + "loss": 0.1713, + "step": 8696 + }, + { + "epoch": 0.139152, + "grad_norm": 0.87890625, + "learning_rate": 8.678064516129034e-05, + "loss": 0.1393, + "step": 8697 + }, + { + "epoch": 0.139168, + "grad_norm": 0.71875, + "learning_rate": 8.677903225806452e-05, + "loss": 0.1906, + "step": 8698 + }, + { + "epoch": 0.139184, + "grad_norm": 0.84375, + "learning_rate": 8.677741935483872e-05, + "loss": 0.1516, + "step": 8699 + }, + { + "epoch": 0.1392, + "grad_norm": 0.96875, + "learning_rate": 8.677580645161291e-05, + "loss": 0.1813, + "step": 8700 + }, + { + "epoch": 0.139216, + "grad_norm": 1.203125, + "learning_rate": 8.677419354838711e-05, + "loss": 0.1982, + "step": 8701 + }, + { + "epoch": 0.139232, + "grad_norm": 0.77734375, + "learning_rate": 8.67725806451613e-05, + "loss": 0.1729, + "step": 8702 + }, + { + "epoch": 0.139248, + "grad_norm": 0.9453125, + "learning_rate": 8.677096774193548e-05, + "loss": 0.1405, + "step": 8703 + }, + { + "epoch": 0.139264, + "grad_norm": 0.73828125, + "learning_rate": 8.676935483870968e-05, + "loss": 0.2066, + "step": 8704 + }, + { + "epoch": 0.13928, + "grad_norm": 0.671875, + "learning_rate": 8.676774193548387e-05, + "loss": 0.1973, + "step": 8705 + }, + { + "epoch": 0.139296, + "grad_norm": 0.80859375, + "learning_rate": 8.676612903225807e-05, + "loss": 0.1455, + "step": 8706 + }, + { + "epoch": 0.139312, + "grad_norm": 0.796875, + "learning_rate": 8.676451612903225e-05, + "loss": 0.1778, + "step": 8707 + }, + { + "epoch": 0.139328, + "grad_norm": 0.453125, + "learning_rate": 8.676290322580645e-05, + "loss": 0.1264, + "step": 8708 + }, + { + "epoch": 0.139344, + "grad_norm": 0.84375, + "learning_rate": 8.676129032258065e-05, + "loss": 0.2068, + "step": 8709 + }, + { + "epoch": 0.13936, + "grad_norm": 1.0078125, + "learning_rate": 8.675967741935485e-05, + "loss": 0.2002, + "step": 8710 + }, + { + "epoch": 0.139376, + "grad_norm": 0.80078125, + "learning_rate": 8.675806451612904e-05, + "loss": 0.1926, + "step": 8711 + }, + { + "epoch": 0.139392, + "grad_norm": 0.6953125, + "learning_rate": 8.675645161290324e-05, + "loss": 0.1653, + "step": 8712 + }, + { + "epoch": 0.139408, + "grad_norm": 0.57421875, + "learning_rate": 8.675483870967742e-05, + "loss": 0.1845, + "step": 8713 + }, + { + "epoch": 0.139424, + "grad_norm": 0.6484375, + "learning_rate": 8.675322580645162e-05, + "loss": 0.168, + "step": 8714 + }, + { + "epoch": 0.13944, + "grad_norm": 0.546875, + "learning_rate": 8.675161290322581e-05, + "loss": 0.1532, + "step": 8715 + }, + { + "epoch": 0.139456, + "grad_norm": 1.0234375, + "learning_rate": 8.675000000000001e-05, + "loss": 0.1703, + "step": 8716 + }, + { + "epoch": 0.139472, + "grad_norm": 0.7109375, + "learning_rate": 8.67483870967742e-05, + "loss": 0.192, + "step": 8717 + }, + { + "epoch": 0.139488, + "grad_norm": 0.76171875, + "learning_rate": 8.674677419354838e-05, + "loss": 0.1478, + "step": 8718 + }, + { + "epoch": 0.139504, + "grad_norm": 0.68359375, + "learning_rate": 8.674516129032258e-05, + "loss": 0.192, + "step": 8719 + }, + { + "epoch": 0.13952, + "grad_norm": 0.69921875, + "learning_rate": 8.674354838709678e-05, + "loss": 0.1422, + "step": 8720 + }, + { + "epoch": 0.139536, + "grad_norm": 0.84375, + "learning_rate": 8.674193548387098e-05, + "loss": 0.1796, + "step": 8721 + }, + { + "epoch": 0.139552, + "grad_norm": 0.91015625, + "learning_rate": 8.674032258064517e-05, + "loss": 0.1576, + "step": 8722 + }, + { + "epoch": 0.139568, + "grad_norm": 1.171875, + "learning_rate": 8.673870967741936e-05, + "loss": 0.2285, + "step": 8723 + }, + { + "epoch": 0.139584, + "grad_norm": 0.58984375, + "learning_rate": 8.673709677419355e-05, + "loss": 0.1648, + "step": 8724 + }, + { + "epoch": 0.1396, + "grad_norm": 0.640625, + "learning_rate": 8.673548387096775e-05, + "loss": 0.1837, + "step": 8725 + }, + { + "epoch": 0.139616, + "grad_norm": 0.75, + "learning_rate": 8.673387096774194e-05, + "loss": 0.2049, + "step": 8726 + }, + { + "epoch": 0.139632, + "grad_norm": 0.435546875, + "learning_rate": 8.673225806451614e-05, + "loss": 0.144, + "step": 8727 + }, + { + "epoch": 0.139648, + "grad_norm": 1.265625, + "learning_rate": 8.673064516129032e-05, + "loss": 0.1624, + "step": 8728 + }, + { + "epoch": 0.139664, + "grad_norm": 0.8984375, + "learning_rate": 8.672903225806452e-05, + "loss": 0.1719, + "step": 8729 + }, + { + "epoch": 0.13968, + "grad_norm": 1.0546875, + "learning_rate": 8.672741935483871e-05, + "loss": 0.2195, + "step": 8730 + }, + { + "epoch": 0.139696, + "grad_norm": 0.87890625, + "learning_rate": 8.672580645161291e-05, + "loss": 0.1628, + "step": 8731 + }, + { + "epoch": 0.139712, + "grad_norm": 0.6875, + "learning_rate": 8.672419354838711e-05, + "loss": 0.1945, + "step": 8732 + }, + { + "epoch": 0.139728, + "grad_norm": 0.5703125, + "learning_rate": 8.67225806451613e-05, + "loss": 0.1693, + "step": 8733 + }, + { + "epoch": 0.139744, + "grad_norm": 0.96484375, + "learning_rate": 8.672096774193549e-05, + "loss": 0.1835, + "step": 8734 + }, + { + "epoch": 0.13976, + "grad_norm": 0.7578125, + "learning_rate": 8.671935483870968e-05, + "loss": 0.2075, + "step": 8735 + }, + { + "epoch": 0.139776, + "grad_norm": 0.98828125, + "learning_rate": 8.671774193548388e-05, + "loss": 0.1581, + "step": 8736 + }, + { + "epoch": 0.139792, + "grad_norm": 0.97265625, + "learning_rate": 8.671612903225806e-05, + "loss": 0.1942, + "step": 8737 + }, + { + "epoch": 0.139808, + "grad_norm": 0.5546875, + "learning_rate": 8.671451612903226e-05, + "loss": 0.1587, + "step": 8738 + }, + { + "epoch": 0.139824, + "grad_norm": 0.60546875, + "learning_rate": 8.671290322580645e-05, + "loss": 0.1572, + "step": 8739 + }, + { + "epoch": 0.13984, + "grad_norm": 0.484375, + "learning_rate": 8.671129032258065e-05, + "loss": 0.1668, + "step": 8740 + }, + { + "epoch": 0.139856, + "grad_norm": 1.4921875, + "learning_rate": 8.670967741935484e-05, + "loss": 0.183, + "step": 8741 + }, + { + "epoch": 0.139872, + "grad_norm": 0.984375, + "learning_rate": 8.670806451612904e-05, + "loss": 0.1739, + "step": 8742 + }, + { + "epoch": 0.139888, + "grad_norm": 0.63671875, + "learning_rate": 8.670645161290322e-05, + "loss": 0.1447, + "step": 8743 + }, + { + "epoch": 0.139904, + "grad_norm": 0.71484375, + "learning_rate": 8.670483870967742e-05, + "loss": 0.1462, + "step": 8744 + }, + { + "epoch": 0.13992, + "grad_norm": 0.796875, + "learning_rate": 8.670322580645162e-05, + "loss": 0.2006, + "step": 8745 + }, + { + "epoch": 0.139936, + "grad_norm": 0.86328125, + "learning_rate": 8.670161290322582e-05, + "loss": 0.1969, + "step": 8746 + }, + { + "epoch": 0.139952, + "grad_norm": 0.75, + "learning_rate": 8.67e-05, + "loss": 0.1867, + "step": 8747 + }, + { + "epoch": 0.139968, + "grad_norm": 0.66796875, + "learning_rate": 8.66983870967742e-05, + "loss": 0.1756, + "step": 8748 + }, + { + "epoch": 0.139984, + "grad_norm": 0.86328125, + "learning_rate": 8.669677419354839e-05, + "loss": 0.1742, + "step": 8749 + }, + { + "epoch": 0.14, + "grad_norm": 0.515625, + "learning_rate": 8.669516129032258e-05, + "loss": 0.1548, + "step": 8750 + }, + { + "epoch": 0.140016, + "grad_norm": 0.8515625, + "learning_rate": 8.669354838709678e-05, + "loss": 0.1984, + "step": 8751 + }, + { + "epoch": 0.140032, + "grad_norm": 0.94921875, + "learning_rate": 8.669193548387096e-05, + "loss": 0.2087, + "step": 8752 + }, + { + "epoch": 0.140048, + "grad_norm": 0.65234375, + "learning_rate": 8.669032258064516e-05, + "loss": 0.1726, + "step": 8753 + }, + { + "epoch": 0.140064, + "grad_norm": 0.52734375, + "learning_rate": 8.668870967741935e-05, + "loss": 0.1446, + "step": 8754 + }, + { + "epoch": 0.14008, + "grad_norm": 0.93359375, + "learning_rate": 8.668709677419355e-05, + "loss": 0.1555, + "step": 8755 + }, + { + "epoch": 0.140096, + "grad_norm": 1.0234375, + "learning_rate": 8.668548387096775e-05, + "loss": 0.1553, + "step": 8756 + }, + { + "epoch": 0.140112, + "grad_norm": 1.2265625, + "learning_rate": 8.668387096774195e-05, + "loss": 0.219, + "step": 8757 + }, + { + "epoch": 0.140128, + "grad_norm": 1.0625, + "learning_rate": 8.668225806451613e-05, + "loss": 0.2073, + "step": 8758 + }, + { + "epoch": 0.140144, + "grad_norm": 1.046875, + "learning_rate": 8.668064516129033e-05, + "loss": 0.1656, + "step": 8759 + }, + { + "epoch": 0.14016, + "grad_norm": 1.2734375, + "learning_rate": 8.667903225806452e-05, + "loss": 0.2119, + "step": 8760 + }, + { + "epoch": 0.140176, + "grad_norm": 0.75, + "learning_rate": 8.667741935483872e-05, + "loss": 0.1783, + "step": 8761 + }, + { + "epoch": 0.140192, + "grad_norm": 0.74609375, + "learning_rate": 8.66758064516129e-05, + "loss": 0.1895, + "step": 8762 + }, + { + "epoch": 0.140208, + "grad_norm": 0.80859375, + "learning_rate": 8.66741935483871e-05, + "loss": 0.1589, + "step": 8763 + }, + { + "epoch": 0.140224, + "grad_norm": 1.2265625, + "learning_rate": 8.667258064516129e-05, + "loss": 0.1621, + "step": 8764 + }, + { + "epoch": 0.14024, + "grad_norm": 0.73828125, + "learning_rate": 8.667096774193548e-05, + "loss": 0.1682, + "step": 8765 + }, + { + "epoch": 0.140256, + "grad_norm": 0.74609375, + "learning_rate": 8.666935483870968e-05, + "loss": 0.1583, + "step": 8766 + }, + { + "epoch": 0.140272, + "grad_norm": 0.81640625, + "learning_rate": 8.666774193548386e-05, + "loss": 0.1978, + "step": 8767 + }, + { + "epoch": 0.140288, + "grad_norm": 0.66015625, + "learning_rate": 8.666612903225806e-05, + "loss": 0.1533, + "step": 8768 + }, + { + "epoch": 0.140304, + "grad_norm": 0.6484375, + "learning_rate": 8.666451612903226e-05, + "loss": 0.1764, + "step": 8769 + }, + { + "epoch": 0.14032, + "grad_norm": 0.7578125, + "learning_rate": 8.666290322580646e-05, + "loss": 0.1476, + "step": 8770 + }, + { + "epoch": 0.140336, + "grad_norm": 1.0390625, + "learning_rate": 8.666129032258065e-05, + "loss": 0.1782, + "step": 8771 + }, + { + "epoch": 0.140352, + "grad_norm": 0.703125, + "learning_rate": 8.665967741935485e-05, + "loss": 0.1957, + "step": 8772 + }, + { + "epoch": 0.140368, + "grad_norm": 0.67578125, + "learning_rate": 8.665806451612903e-05, + "loss": 0.1238, + "step": 8773 + }, + { + "epoch": 0.140384, + "grad_norm": 0.953125, + "learning_rate": 8.665645161290323e-05, + "loss": 0.1654, + "step": 8774 + }, + { + "epoch": 0.1404, + "grad_norm": 1.1015625, + "learning_rate": 8.665483870967742e-05, + "loss": 0.1547, + "step": 8775 + }, + { + "epoch": 0.140416, + "grad_norm": 0.73828125, + "learning_rate": 8.665322580645162e-05, + "loss": 0.1911, + "step": 8776 + }, + { + "epoch": 0.140432, + "grad_norm": 0.90234375, + "learning_rate": 8.66516129032258e-05, + "loss": 0.1914, + "step": 8777 + }, + { + "epoch": 0.140448, + "grad_norm": 0.80859375, + "learning_rate": 8.665e-05, + "loss": 0.1907, + "step": 8778 + }, + { + "epoch": 0.140464, + "grad_norm": 0.765625, + "learning_rate": 8.664838709677419e-05, + "loss": 0.1849, + "step": 8779 + }, + { + "epoch": 0.14048, + "grad_norm": 0.96875, + "learning_rate": 8.664677419354839e-05, + "loss": 0.1366, + "step": 8780 + }, + { + "epoch": 0.140496, + "grad_norm": 0.73828125, + "learning_rate": 8.664516129032259e-05, + "loss": 0.1524, + "step": 8781 + }, + { + "epoch": 0.140512, + "grad_norm": 0.59375, + "learning_rate": 8.664354838709678e-05, + "loss": 0.1781, + "step": 8782 + }, + { + "epoch": 0.140528, + "grad_norm": 0.78125, + "learning_rate": 8.664193548387098e-05, + "loss": 0.1816, + "step": 8783 + }, + { + "epoch": 0.140544, + "grad_norm": 1.09375, + "learning_rate": 8.664032258064516e-05, + "loss": 0.168, + "step": 8784 + }, + { + "epoch": 0.14056, + "grad_norm": 0.51171875, + "learning_rate": 8.663870967741936e-05, + "loss": 0.169, + "step": 8785 + }, + { + "epoch": 0.140576, + "grad_norm": 0.89453125, + "learning_rate": 8.663709677419355e-05, + "loss": 0.1611, + "step": 8786 + }, + { + "epoch": 0.140592, + "grad_norm": 1.0625, + "learning_rate": 8.663548387096775e-05, + "loss": 0.2421, + "step": 8787 + }, + { + "epoch": 0.140608, + "grad_norm": 0.7578125, + "learning_rate": 8.663387096774193e-05, + "loss": 0.1773, + "step": 8788 + }, + { + "epoch": 0.140624, + "grad_norm": 0.83203125, + "learning_rate": 8.663225806451613e-05, + "loss": 0.1761, + "step": 8789 + }, + { + "epoch": 0.14064, + "grad_norm": 1.3125, + "learning_rate": 8.663064516129032e-05, + "loss": 0.1778, + "step": 8790 + }, + { + "epoch": 0.140656, + "grad_norm": 1.3984375, + "learning_rate": 8.662903225806452e-05, + "loss": 0.1829, + "step": 8791 + }, + { + "epoch": 0.140672, + "grad_norm": 0.859375, + "learning_rate": 8.662741935483872e-05, + "loss": 0.168, + "step": 8792 + }, + { + "epoch": 0.140688, + "grad_norm": 0.9375, + "learning_rate": 8.662580645161292e-05, + "loss": 0.2003, + "step": 8793 + }, + { + "epoch": 0.140704, + "grad_norm": 1.1796875, + "learning_rate": 8.66241935483871e-05, + "loss": 0.1744, + "step": 8794 + }, + { + "epoch": 0.14072, + "grad_norm": 1.1328125, + "learning_rate": 8.66225806451613e-05, + "loss": 0.1671, + "step": 8795 + }, + { + "epoch": 0.140736, + "grad_norm": 1.4921875, + "learning_rate": 8.662096774193549e-05, + "loss": 0.166, + "step": 8796 + }, + { + "epoch": 0.140752, + "grad_norm": 0.8671875, + "learning_rate": 8.661935483870968e-05, + "loss": 0.1703, + "step": 8797 + }, + { + "epoch": 0.140768, + "grad_norm": 0.80078125, + "learning_rate": 8.661774193548388e-05, + "loss": 0.2124, + "step": 8798 + }, + { + "epoch": 0.140784, + "grad_norm": 1.140625, + "learning_rate": 8.661612903225806e-05, + "loss": 0.1787, + "step": 8799 + }, + { + "epoch": 0.1408, + "grad_norm": 0.640625, + "learning_rate": 8.661451612903226e-05, + "loss": 0.187, + "step": 8800 + }, + { + "epoch": 0.140816, + "grad_norm": 0.8125, + "learning_rate": 8.661290322580645e-05, + "loss": 0.1976, + "step": 8801 + }, + { + "epoch": 0.140832, + "grad_norm": 1.734375, + "learning_rate": 8.661129032258065e-05, + "loss": 0.19, + "step": 8802 + }, + { + "epoch": 0.140848, + "grad_norm": 0.7265625, + "learning_rate": 8.660967741935483e-05, + "loss": 0.1745, + "step": 8803 + }, + { + "epoch": 0.140864, + "grad_norm": 0.92578125, + "learning_rate": 8.660806451612903e-05, + "loss": 0.1838, + "step": 8804 + }, + { + "epoch": 0.14088, + "grad_norm": 1.0078125, + "learning_rate": 8.660645161290323e-05, + "loss": 0.1449, + "step": 8805 + }, + { + "epoch": 0.140896, + "grad_norm": 1.296875, + "learning_rate": 8.660483870967743e-05, + "loss": 0.1752, + "step": 8806 + }, + { + "epoch": 0.140912, + "grad_norm": 0.86328125, + "learning_rate": 8.660322580645162e-05, + "loss": 0.1627, + "step": 8807 + }, + { + "epoch": 0.140928, + "grad_norm": 0.75, + "learning_rate": 8.660161290322582e-05, + "loss": 0.1971, + "step": 8808 + }, + { + "epoch": 0.140944, + "grad_norm": 1.640625, + "learning_rate": 8.66e-05, + "loss": 0.2178, + "step": 8809 + }, + { + "epoch": 0.14096, + "grad_norm": 0.640625, + "learning_rate": 8.65983870967742e-05, + "loss": 0.1406, + "step": 8810 + }, + { + "epoch": 0.140976, + "grad_norm": 0.60546875, + "learning_rate": 8.659677419354839e-05, + "loss": 0.1498, + "step": 8811 + }, + { + "epoch": 0.140992, + "grad_norm": 1.71875, + "learning_rate": 8.659516129032258e-05, + "loss": 0.1789, + "step": 8812 + }, + { + "epoch": 0.141008, + "grad_norm": 1.4140625, + "learning_rate": 8.659354838709678e-05, + "loss": 0.1511, + "step": 8813 + }, + { + "epoch": 0.141024, + "grad_norm": 0.62109375, + "learning_rate": 8.659193548387096e-05, + "loss": 0.1848, + "step": 8814 + }, + { + "epoch": 0.14104, + "grad_norm": 1.515625, + "learning_rate": 8.659032258064516e-05, + "loss": 0.2342, + "step": 8815 + }, + { + "epoch": 0.141056, + "grad_norm": 0.890625, + "learning_rate": 8.658870967741936e-05, + "loss": 0.2001, + "step": 8816 + }, + { + "epoch": 0.141072, + "grad_norm": 1.25, + "learning_rate": 8.658709677419356e-05, + "loss": 0.1667, + "step": 8817 + }, + { + "epoch": 0.141088, + "grad_norm": 1.1171875, + "learning_rate": 8.658548387096775e-05, + "loss": 0.2103, + "step": 8818 + }, + { + "epoch": 0.141104, + "grad_norm": 1.0703125, + "learning_rate": 8.658387096774195e-05, + "loss": 0.1499, + "step": 8819 + }, + { + "epoch": 0.14112, + "grad_norm": 0.9296875, + "learning_rate": 8.658225806451613e-05, + "loss": 0.1693, + "step": 8820 + }, + { + "epoch": 0.141136, + "grad_norm": 0.63671875, + "learning_rate": 8.658064516129033e-05, + "loss": 0.1831, + "step": 8821 + }, + { + "epoch": 0.141152, + "grad_norm": 1.453125, + "learning_rate": 8.657903225806452e-05, + "loss": 0.1687, + "step": 8822 + }, + { + "epoch": 0.141168, + "grad_norm": 0.59375, + "learning_rate": 8.657741935483872e-05, + "loss": 0.1341, + "step": 8823 + }, + { + "epoch": 0.141184, + "grad_norm": 0.7421875, + "learning_rate": 8.65758064516129e-05, + "loss": 0.1753, + "step": 8824 + }, + { + "epoch": 0.1412, + "grad_norm": 1.046875, + "learning_rate": 8.65741935483871e-05, + "loss": 0.172, + "step": 8825 + }, + { + "epoch": 0.141216, + "grad_norm": 0.68359375, + "learning_rate": 8.657258064516129e-05, + "loss": 0.1389, + "step": 8826 + }, + { + "epoch": 0.141232, + "grad_norm": 1.296875, + "learning_rate": 8.657096774193549e-05, + "loss": 0.2189, + "step": 8827 + }, + { + "epoch": 0.141248, + "grad_norm": 0.85546875, + "learning_rate": 8.656935483870968e-05, + "loss": 0.182, + "step": 8828 + }, + { + "epoch": 0.141264, + "grad_norm": 0.70703125, + "learning_rate": 8.656774193548387e-05, + "loss": 0.175, + "step": 8829 + }, + { + "epoch": 0.14128, + "grad_norm": 1.625, + "learning_rate": 8.656612903225807e-05, + "loss": 0.2024, + "step": 8830 + }, + { + "epoch": 0.141296, + "grad_norm": 1.1953125, + "learning_rate": 8.656451612903226e-05, + "loss": 0.1843, + "step": 8831 + }, + { + "epoch": 0.141312, + "grad_norm": 0.96484375, + "learning_rate": 8.656290322580646e-05, + "loss": 0.1725, + "step": 8832 + }, + { + "epoch": 0.141328, + "grad_norm": 1.6015625, + "learning_rate": 8.656129032258065e-05, + "loss": 0.1765, + "step": 8833 + }, + { + "epoch": 0.141344, + "grad_norm": 0.6953125, + "learning_rate": 8.655967741935485e-05, + "loss": 0.1918, + "step": 8834 + }, + { + "epoch": 0.14136, + "grad_norm": 0.68359375, + "learning_rate": 8.655806451612903e-05, + "loss": 0.1732, + "step": 8835 + }, + { + "epoch": 0.141376, + "grad_norm": 0.97265625, + "learning_rate": 8.655645161290323e-05, + "loss": 0.186, + "step": 8836 + }, + { + "epoch": 0.141392, + "grad_norm": 0.99609375, + "learning_rate": 8.655483870967742e-05, + "loss": 0.2286, + "step": 8837 + }, + { + "epoch": 0.141408, + "grad_norm": 1.609375, + "learning_rate": 8.655322580645162e-05, + "loss": 0.1999, + "step": 8838 + }, + { + "epoch": 0.141424, + "grad_norm": 0.73046875, + "learning_rate": 8.65516129032258e-05, + "loss": 0.1334, + "step": 8839 + }, + { + "epoch": 0.14144, + "grad_norm": 1.03125, + "learning_rate": 8.655e-05, + "loss": 0.1369, + "step": 8840 + }, + { + "epoch": 0.141456, + "grad_norm": 1.109375, + "learning_rate": 8.65483870967742e-05, + "loss": 0.2098, + "step": 8841 + }, + { + "epoch": 0.141472, + "grad_norm": 0.75, + "learning_rate": 8.65467741935484e-05, + "loss": 0.1397, + "step": 8842 + }, + { + "epoch": 0.141488, + "grad_norm": 0.828125, + "learning_rate": 8.654516129032259e-05, + "loss": 0.1846, + "step": 8843 + }, + { + "epoch": 0.141504, + "grad_norm": 0.75, + "learning_rate": 8.654354838709677e-05, + "loss": 0.1943, + "step": 8844 + }, + { + "epoch": 0.14152, + "grad_norm": 0.640625, + "learning_rate": 8.654193548387097e-05, + "loss": 0.1824, + "step": 8845 + }, + { + "epoch": 0.141536, + "grad_norm": 0.98828125, + "learning_rate": 8.654032258064516e-05, + "loss": 0.2136, + "step": 8846 + }, + { + "epoch": 0.141552, + "grad_norm": 0.76171875, + "learning_rate": 8.653870967741936e-05, + "loss": 0.2081, + "step": 8847 + }, + { + "epoch": 0.141568, + "grad_norm": 0.70703125, + "learning_rate": 8.653709677419355e-05, + "loss": 0.1618, + "step": 8848 + }, + { + "epoch": 0.141584, + "grad_norm": 0.6328125, + "learning_rate": 8.653548387096775e-05, + "loss": 0.1595, + "step": 8849 + }, + { + "epoch": 0.1416, + "grad_norm": 0.5234375, + "learning_rate": 8.653387096774193e-05, + "loss": 0.1513, + "step": 8850 + }, + { + "epoch": 0.141616, + "grad_norm": 0.72265625, + "learning_rate": 8.653225806451613e-05, + "loss": 0.1976, + "step": 8851 + }, + { + "epoch": 0.141632, + "grad_norm": 0.90625, + "learning_rate": 8.653064516129033e-05, + "loss": 0.187, + "step": 8852 + }, + { + "epoch": 0.141648, + "grad_norm": 0.84765625, + "learning_rate": 8.652903225806453e-05, + "loss": 0.1668, + "step": 8853 + }, + { + "epoch": 0.141664, + "grad_norm": 0.69921875, + "learning_rate": 8.652741935483872e-05, + "loss": 0.1307, + "step": 8854 + }, + { + "epoch": 0.14168, + "grad_norm": 0.8125, + "learning_rate": 8.652580645161292e-05, + "loss": 0.1916, + "step": 8855 + }, + { + "epoch": 0.141696, + "grad_norm": 1.171875, + "learning_rate": 8.65241935483871e-05, + "loss": 0.1826, + "step": 8856 + }, + { + "epoch": 0.141712, + "grad_norm": 0.7265625, + "learning_rate": 8.65225806451613e-05, + "loss": 0.197, + "step": 8857 + }, + { + "epoch": 0.141728, + "grad_norm": 1.3359375, + "learning_rate": 8.652096774193549e-05, + "loss": 0.1891, + "step": 8858 + }, + { + "epoch": 0.141744, + "grad_norm": 0.89453125, + "learning_rate": 8.651935483870967e-05, + "loss": 0.1756, + "step": 8859 + }, + { + "epoch": 0.14176, + "grad_norm": 1.8828125, + "learning_rate": 8.651774193548387e-05, + "loss": 0.1948, + "step": 8860 + }, + { + "epoch": 0.141776, + "grad_norm": 0.6796875, + "learning_rate": 8.651612903225806e-05, + "loss": 0.1854, + "step": 8861 + }, + { + "epoch": 0.141792, + "grad_norm": 0.75, + "learning_rate": 8.651451612903226e-05, + "loss": 0.1343, + "step": 8862 + }, + { + "epoch": 0.141808, + "grad_norm": 0.5859375, + "learning_rate": 8.651290322580645e-05, + "loss": 0.192, + "step": 8863 + }, + { + "epoch": 0.141824, + "grad_norm": 0.6640625, + "learning_rate": 8.651129032258065e-05, + "loss": 0.1342, + "step": 8864 + }, + { + "epoch": 0.14184, + "grad_norm": 1.1015625, + "learning_rate": 8.650967741935484e-05, + "loss": 0.196, + "step": 8865 + }, + { + "epoch": 0.141856, + "grad_norm": 0.7734375, + "learning_rate": 8.650806451612904e-05, + "loss": 0.1526, + "step": 8866 + }, + { + "epoch": 0.141872, + "grad_norm": 0.6875, + "learning_rate": 8.650645161290323e-05, + "loss": 0.1888, + "step": 8867 + }, + { + "epoch": 0.141888, + "grad_norm": 0.54296875, + "learning_rate": 8.650483870967743e-05, + "loss": 0.1741, + "step": 8868 + }, + { + "epoch": 0.141904, + "grad_norm": 0.90234375, + "learning_rate": 8.650322580645162e-05, + "loss": 0.186, + "step": 8869 + }, + { + "epoch": 0.14192, + "grad_norm": 1.0625, + "learning_rate": 8.650161290322582e-05, + "loss": 0.2051, + "step": 8870 + }, + { + "epoch": 0.141936, + "grad_norm": 0.7578125, + "learning_rate": 8.65e-05, + "loss": 0.1694, + "step": 8871 + }, + { + "epoch": 0.141952, + "grad_norm": 0.97265625, + "learning_rate": 8.64983870967742e-05, + "loss": 0.2186, + "step": 8872 + }, + { + "epoch": 0.141968, + "grad_norm": 0.86328125, + "learning_rate": 8.649677419354839e-05, + "loss": 0.1414, + "step": 8873 + }, + { + "epoch": 0.141984, + "grad_norm": 0.765625, + "learning_rate": 8.649516129032257e-05, + "loss": 0.1255, + "step": 8874 + }, + { + "epoch": 0.142, + "grad_norm": 0.7578125, + "learning_rate": 8.649354838709677e-05, + "loss": 0.1562, + "step": 8875 + }, + { + "epoch": 0.142016, + "grad_norm": 0.6328125, + "learning_rate": 8.649193548387097e-05, + "loss": 0.1657, + "step": 8876 + }, + { + "epoch": 0.142032, + "grad_norm": 1.015625, + "learning_rate": 8.649032258064517e-05, + "loss": 0.1612, + "step": 8877 + }, + { + "epoch": 0.142048, + "grad_norm": 0.6015625, + "learning_rate": 8.648870967741936e-05, + "loss": 0.1502, + "step": 8878 + }, + { + "epoch": 0.142064, + "grad_norm": 0.53125, + "learning_rate": 8.648709677419356e-05, + "loss": 0.1613, + "step": 8879 + }, + { + "epoch": 0.14208, + "grad_norm": 1.4296875, + "learning_rate": 8.648548387096774e-05, + "loss": 0.1633, + "step": 8880 + }, + { + "epoch": 0.142096, + "grad_norm": 1.015625, + "learning_rate": 8.648387096774194e-05, + "loss": 0.2072, + "step": 8881 + }, + { + "epoch": 0.142112, + "grad_norm": 0.75390625, + "learning_rate": 8.648225806451613e-05, + "loss": 0.1796, + "step": 8882 + }, + { + "epoch": 0.142128, + "grad_norm": 1.015625, + "learning_rate": 8.648064516129033e-05, + "loss": 0.1909, + "step": 8883 + }, + { + "epoch": 0.142144, + "grad_norm": 0.59765625, + "learning_rate": 8.647903225806452e-05, + "loss": 0.1859, + "step": 8884 + }, + { + "epoch": 0.14216, + "grad_norm": 0.875, + "learning_rate": 8.647741935483872e-05, + "loss": 0.2056, + "step": 8885 + }, + { + "epoch": 0.142176, + "grad_norm": 0.62109375, + "learning_rate": 8.64758064516129e-05, + "loss": 0.1399, + "step": 8886 + }, + { + "epoch": 0.142192, + "grad_norm": 1.15625, + "learning_rate": 8.64741935483871e-05, + "loss": 0.1957, + "step": 8887 + }, + { + "epoch": 0.142208, + "grad_norm": 0.68359375, + "learning_rate": 8.64725806451613e-05, + "loss": 0.1799, + "step": 8888 + }, + { + "epoch": 0.142224, + "grad_norm": 0.890625, + "learning_rate": 8.647096774193549e-05, + "loss": 0.1752, + "step": 8889 + }, + { + "epoch": 0.14224, + "grad_norm": 0.7578125, + "learning_rate": 8.646935483870969e-05, + "loss": 0.1618, + "step": 8890 + }, + { + "epoch": 0.142256, + "grad_norm": 0.58984375, + "learning_rate": 8.646774193548387e-05, + "loss": 0.1654, + "step": 8891 + }, + { + "epoch": 0.142272, + "grad_norm": 0.6171875, + "learning_rate": 8.646612903225807e-05, + "loss": 0.1563, + "step": 8892 + }, + { + "epoch": 0.142288, + "grad_norm": 1.03125, + "learning_rate": 8.646451612903226e-05, + "loss": 0.2092, + "step": 8893 + }, + { + "epoch": 0.142304, + "grad_norm": 0.9765625, + "learning_rate": 8.646290322580646e-05, + "loss": 0.1697, + "step": 8894 + }, + { + "epoch": 0.14232, + "grad_norm": 0.77734375, + "learning_rate": 8.646129032258064e-05, + "loss": 0.1722, + "step": 8895 + }, + { + "epoch": 0.142336, + "grad_norm": 0.921875, + "learning_rate": 8.645967741935484e-05, + "loss": 0.1943, + "step": 8896 + }, + { + "epoch": 0.142352, + "grad_norm": 0.79296875, + "learning_rate": 8.645806451612903e-05, + "loss": 0.2033, + "step": 8897 + }, + { + "epoch": 0.142368, + "grad_norm": 1.2421875, + "learning_rate": 8.645645161290323e-05, + "loss": 0.1561, + "step": 8898 + }, + { + "epoch": 0.142384, + "grad_norm": 0.953125, + "learning_rate": 8.645483870967742e-05, + "loss": 0.1566, + "step": 8899 + }, + { + "epoch": 0.1424, + "grad_norm": 1.3359375, + "learning_rate": 8.645322580645161e-05, + "loss": 0.1621, + "step": 8900 + }, + { + "epoch": 0.142416, + "grad_norm": 1.1875, + "learning_rate": 8.645161290322581e-05, + "loss": 0.205, + "step": 8901 + }, + { + "epoch": 0.142432, + "grad_norm": 0.76171875, + "learning_rate": 8.645000000000001e-05, + "loss": 0.1788, + "step": 8902 + }, + { + "epoch": 0.142448, + "grad_norm": 1.0546875, + "learning_rate": 8.64483870967742e-05, + "loss": 0.193, + "step": 8903 + }, + { + "epoch": 0.142464, + "grad_norm": 0.93359375, + "learning_rate": 8.64467741935484e-05, + "loss": 0.16, + "step": 8904 + }, + { + "epoch": 0.14248, + "grad_norm": 1.2265625, + "learning_rate": 8.644516129032259e-05, + "loss": 0.197, + "step": 8905 + }, + { + "epoch": 0.142496, + "grad_norm": 1.046875, + "learning_rate": 8.644354838709677e-05, + "loss": 0.1537, + "step": 8906 + }, + { + "epoch": 0.142512, + "grad_norm": 0.671875, + "learning_rate": 8.644193548387097e-05, + "loss": 0.1938, + "step": 8907 + }, + { + "epoch": 0.142528, + "grad_norm": 0.9453125, + "learning_rate": 8.644032258064516e-05, + "loss": 0.1433, + "step": 8908 + }, + { + "epoch": 0.142544, + "grad_norm": 0.9921875, + "learning_rate": 8.643870967741936e-05, + "loss": 0.1741, + "step": 8909 + }, + { + "epoch": 0.14256, + "grad_norm": 0.76171875, + "learning_rate": 8.643709677419354e-05, + "loss": 0.1431, + "step": 8910 + }, + { + "epoch": 0.142576, + "grad_norm": 1.0, + "learning_rate": 8.643548387096774e-05, + "loss": 0.1988, + "step": 8911 + }, + { + "epoch": 0.142592, + "grad_norm": 1.078125, + "learning_rate": 8.643387096774194e-05, + "loss": 0.2238, + "step": 8912 + }, + { + "epoch": 0.142608, + "grad_norm": 0.8828125, + "learning_rate": 8.643225806451614e-05, + "loss": 0.1427, + "step": 8913 + }, + { + "epoch": 0.142624, + "grad_norm": 0.85546875, + "learning_rate": 8.643064516129033e-05, + "loss": 0.1889, + "step": 8914 + }, + { + "epoch": 0.14264, + "grad_norm": 0.55078125, + "learning_rate": 8.642903225806453e-05, + "loss": 0.1624, + "step": 8915 + }, + { + "epoch": 0.142656, + "grad_norm": 0.6640625, + "learning_rate": 8.642741935483871e-05, + "loss": 0.1858, + "step": 8916 + }, + { + "epoch": 0.142672, + "grad_norm": 0.66796875, + "learning_rate": 8.642580645161291e-05, + "loss": 0.178, + "step": 8917 + }, + { + "epoch": 0.142688, + "grad_norm": 0.69921875, + "learning_rate": 8.64241935483871e-05, + "loss": 0.1516, + "step": 8918 + }, + { + "epoch": 0.142704, + "grad_norm": 0.625, + "learning_rate": 8.64225806451613e-05, + "loss": 0.1874, + "step": 8919 + }, + { + "epoch": 0.14272, + "grad_norm": 0.84375, + "learning_rate": 8.642096774193549e-05, + "loss": 0.1666, + "step": 8920 + }, + { + "epoch": 0.142736, + "grad_norm": 0.6171875, + "learning_rate": 8.641935483870967e-05, + "loss": 0.1814, + "step": 8921 + }, + { + "epoch": 0.142752, + "grad_norm": 0.71484375, + "learning_rate": 8.641774193548387e-05, + "loss": 0.2178, + "step": 8922 + }, + { + "epoch": 0.142768, + "grad_norm": 0.80859375, + "learning_rate": 8.641612903225806e-05, + "loss": 0.1708, + "step": 8923 + }, + { + "epoch": 0.142784, + "grad_norm": 0.7578125, + "learning_rate": 8.641451612903226e-05, + "loss": 0.1756, + "step": 8924 + }, + { + "epoch": 0.1428, + "grad_norm": 0.6484375, + "learning_rate": 8.641290322580646e-05, + "loss": 0.151, + "step": 8925 + }, + { + "epoch": 0.142816, + "grad_norm": 1.3125, + "learning_rate": 8.641129032258066e-05, + "loss": 0.1654, + "step": 8926 + }, + { + "epoch": 0.142832, + "grad_norm": 0.90234375, + "learning_rate": 8.640967741935484e-05, + "loss": 0.1768, + "step": 8927 + }, + { + "epoch": 0.142848, + "grad_norm": 0.62109375, + "learning_rate": 8.640806451612904e-05, + "loss": 0.1296, + "step": 8928 + }, + { + "epoch": 0.142864, + "grad_norm": 0.76171875, + "learning_rate": 8.640645161290323e-05, + "loss": 0.1803, + "step": 8929 + }, + { + "epoch": 0.14288, + "grad_norm": 0.9140625, + "learning_rate": 8.640483870967743e-05, + "loss": 0.1826, + "step": 8930 + }, + { + "epoch": 0.142896, + "grad_norm": 0.62109375, + "learning_rate": 8.640322580645161e-05, + "loss": 0.137, + "step": 8931 + }, + { + "epoch": 0.142912, + "grad_norm": 0.6171875, + "learning_rate": 8.640161290322581e-05, + "loss": 0.1884, + "step": 8932 + }, + { + "epoch": 0.142928, + "grad_norm": 0.6015625, + "learning_rate": 8.64e-05, + "loss": 0.1798, + "step": 8933 + }, + { + "epoch": 0.142944, + "grad_norm": 0.734375, + "learning_rate": 8.63983870967742e-05, + "loss": 0.174, + "step": 8934 + }, + { + "epoch": 0.14296, + "grad_norm": 0.62109375, + "learning_rate": 8.639677419354839e-05, + "loss": 0.152, + "step": 8935 + }, + { + "epoch": 0.142976, + "grad_norm": 0.96484375, + "learning_rate": 8.639516129032258e-05, + "loss": 0.1818, + "step": 8936 + }, + { + "epoch": 0.142992, + "grad_norm": 0.62890625, + "learning_rate": 8.639354838709678e-05, + "loss": 0.1653, + "step": 8937 + }, + { + "epoch": 0.143008, + "grad_norm": 0.578125, + "learning_rate": 8.639193548387097e-05, + "loss": 0.1606, + "step": 8938 + }, + { + "epoch": 0.143024, + "grad_norm": 0.8046875, + "learning_rate": 8.639032258064517e-05, + "loss": 0.1349, + "step": 8939 + }, + { + "epoch": 0.14304, + "grad_norm": 0.91015625, + "learning_rate": 8.638870967741936e-05, + "loss": 0.1571, + "step": 8940 + }, + { + "epoch": 0.143056, + "grad_norm": 1.0546875, + "learning_rate": 8.638709677419356e-05, + "loss": 0.1793, + "step": 8941 + }, + { + "epoch": 0.143072, + "grad_norm": 0.91796875, + "learning_rate": 8.638548387096774e-05, + "loss": 0.16, + "step": 8942 + }, + { + "epoch": 0.143088, + "grad_norm": 0.62109375, + "learning_rate": 8.638387096774194e-05, + "loss": 0.1773, + "step": 8943 + }, + { + "epoch": 0.143104, + "grad_norm": 0.72265625, + "learning_rate": 8.638225806451613e-05, + "loss": 0.2113, + "step": 8944 + }, + { + "epoch": 0.14312, + "grad_norm": 0.65234375, + "learning_rate": 8.638064516129033e-05, + "loss": 0.1431, + "step": 8945 + }, + { + "epoch": 0.143136, + "grad_norm": 0.796875, + "learning_rate": 8.637903225806451e-05, + "loss": 0.2174, + "step": 8946 + }, + { + "epoch": 0.143152, + "grad_norm": 0.70703125, + "learning_rate": 8.637741935483871e-05, + "loss": 0.1835, + "step": 8947 + }, + { + "epoch": 0.143168, + "grad_norm": 0.796875, + "learning_rate": 8.637580645161291e-05, + "loss": 0.1792, + "step": 8948 + }, + { + "epoch": 0.143184, + "grad_norm": 0.7890625, + "learning_rate": 8.637419354838711e-05, + "loss": 0.1729, + "step": 8949 + }, + { + "epoch": 0.1432, + "grad_norm": 0.578125, + "learning_rate": 8.63725806451613e-05, + "loss": 0.1306, + "step": 8950 + }, + { + "epoch": 0.143216, + "grad_norm": 0.72265625, + "learning_rate": 8.63709677419355e-05, + "loss": 0.148, + "step": 8951 + }, + { + "epoch": 0.143232, + "grad_norm": 0.859375, + "learning_rate": 8.636935483870968e-05, + "loss": 0.153, + "step": 8952 + }, + { + "epoch": 0.143248, + "grad_norm": 1.03125, + "learning_rate": 8.636774193548387e-05, + "loss": 0.1747, + "step": 8953 + }, + { + "epoch": 0.143264, + "grad_norm": 0.640625, + "learning_rate": 8.636612903225807e-05, + "loss": 0.1747, + "step": 8954 + }, + { + "epoch": 0.14328, + "grad_norm": 1.28125, + "learning_rate": 8.636451612903226e-05, + "loss": 0.1894, + "step": 8955 + }, + { + "epoch": 0.143296, + "grad_norm": 0.7578125, + "learning_rate": 8.636290322580646e-05, + "loss": 0.178, + "step": 8956 + }, + { + "epoch": 0.143312, + "grad_norm": 0.65625, + "learning_rate": 8.636129032258064e-05, + "loss": 0.1436, + "step": 8957 + }, + { + "epoch": 0.143328, + "grad_norm": 1.15625, + "learning_rate": 8.635967741935484e-05, + "loss": 0.1881, + "step": 8958 + }, + { + "epoch": 0.143344, + "grad_norm": 0.86328125, + "learning_rate": 8.635806451612903e-05, + "loss": 0.2209, + "step": 8959 + }, + { + "epoch": 0.14336, + "grad_norm": 0.88671875, + "learning_rate": 8.635645161290323e-05, + "loss": 0.1893, + "step": 8960 + }, + { + "epoch": 0.143376, + "grad_norm": 0.7890625, + "learning_rate": 8.635483870967743e-05, + "loss": 0.2, + "step": 8961 + }, + { + "epoch": 0.143392, + "grad_norm": 1.0078125, + "learning_rate": 8.635322580645163e-05, + "loss": 0.1627, + "step": 8962 + }, + { + "epoch": 0.143408, + "grad_norm": 0.875, + "learning_rate": 8.635161290322581e-05, + "loss": 0.1586, + "step": 8963 + }, + { + "epoch": 0.143424, + "grad_norm": 0.7578125, + "learning_rate": 8.635000000000001e-05, + "loss": 0.1946, + "step": 8964 + }, + { + "epoch": 0.14344, + "grad_norm": 0.90234375, + "learning_rate": 8.63483870967742e-05, + "loss": 0.2041, + "step": 8965 + }, + { + "epoch": 0.143456, + "grad_norm": 1.09375, + "learning_rate": 8.63467741935484e-05, + "loss": 0.1984, + "step": 8966 + }, + { + "epoch": 0.143472, + "grad_norm": 1.0859375, + "learning_rate": 8.634516129032258e-05, + "loss": 0.1885, + "step": 8967 + }, + { + "epoch": 0.143488, + "grad_norm": 1.109375, + "learning_rate": 8.634354838709677e-05, + "loss": 0.1636, + "step": 8968 + }, + { + "epoch": 0.143504, + "grad_norm": 0.76171875, + "learning_rate": 8.634193548387097e-05, + "loss": 0.1936, + "step": 8969 + }, + { + "epoch": 0.14352, + "grad_norm": 0.79296875, + "learning_rate": 8.634032258064516e-05, + "loss": 0.2175, + "step": 8970 + }, + { + "epoch": 0.143536, + "grad_norm": 0.82421875, + "learning_rate": 8.633870967741936e-05, + "loss": 0.1774, + "step": 8971 + }, + { + "epoch": 0.143552, + "grad_norm": 1.078125, + "learning_rate": 8.633709677419355e-05, + "loss": 0.1752, + "step": 8972 + }, + { + "epoch": 0.143568, + "grad_norm": 0.578125, + "learning_rate": 8.633548387096775e-05, + "loss": 0.1561, + "step": 8973 + }, + { + "epoch": 0.143584, + "grad_norm": 0.6484375, + "learning_rate": 8.633387096774194e-05, + "loss": 0.1584, + "step": 8974 + }, + { + "epoch": 0.1436, + "grad_norm": 0.81640625, + "learning_rate": 8.633225806451614e-05, + "loss": 0.1557, + "step": 8975 + }, + { + "epoch": 0.143616, + "grad_norm": 0.65625, + "learning_rate": 8.633064516129033e-05, + "loss": 0.1718, + "step": 8976 + }, + { + "epoch": 0.143632, + "grad_norm": 0.65625, + "learning_rate": 8.632903225806453e-05, + "loss": 0.1434, + "step": 8977 + }, + { + "epoch": 0.143648, + "grad_norm": 0.94140625, + "learning_rate": 8.632741935483871e-05, + "loss": 0.183, + "step": 8978 + }, + { + "epoch": 0.143664, + "grad_norm": 0.92578125, + "learning_rate": 8.632580645161291e-05, + "loss": 0.1621, + "step": 8979 + }, + { + "epoch": 0.14368, + "grad_norm": 0.546875, + "learning_rate": 8.63241935483871e-05, + "loss": 0.1537, + "step": 8980 + }, + { + "epoch": 0.143696, + "grad_norm": 1.09375, + "learning_rate": 8.63225806451613e-05, + "loss": 0.1967, + "step": 8981 + }, + { + "epoch": 0.143712, + "grad_norm": 0.640625, + "learning_rate": 8.632096774193548e-05, + "loss": 0.1575, + "step": 8982 + }, + { + "epoch": 0.143728, + "grad_norm": 0.85546875, + "learning_rate": 8.631935483870968e-05, + "loss": 0.2099, + "step": 8983 + }, + { + "epoch": 0.143744, + "grad_norm": 1.1328125, + "learning_rate": 8.631774193548388e-05, + "loss": 0.1713, + "step": 8984 + }, + { + "epoch": 0.14376, + "grad_norm": 0.875, + "learning_rate": 8.631612903225807e-05, + "loss": 0.1772, + "step": 8985 + }, + { + "epoch": 0.143776, + "grad_norm": 1.171875, + "learning_rate": 8.631451612903227e-05, + "loss": 0.2033, + "step": 8986 + }, + { + "epoch": 0.143792, + "grad_norm": 1.0625, + "learning_rate": 8.631290322580645e-05, + "loss": 0.1491, + "step": 8987 + }, + { + "epoch": 0.143808, + "grad_norm": 0.7734375, + "learning_rate": 8.631129032258065e-05, + "loss": 0.1727, + "step": 8988 + }, + { + "epoch": 0.143824, + "grad_norm": 0.8984375, + "learning_rate": 8.630967741935484e-05, + "loss": 0.1672, + "step": 8989 + }, + { + "epoch": 0.14384, + "grad_norm": 1.546875, + "learning_rate": 8.630806451612904e-05, + "loss": 0.1801, + "step": 8990 + }, + { + "epoch": 0.143856, + "grad_norm": 0.63671875, + "learning_rate": 8.630645161290323e-05, + "loss": 0.1544, + "step": 8991 + }, + { + "epoch": 0.143872, + "grad_norm": 0.9140625, + "learning_rate": 8.630483870967743e-05, + "loss": 0.1616, + "step": 8992 + }, + { + "epoch": 0.143888, + "grad_norm": 1.046875, + "learning_rate": 8.630322580645161e-05, + "loss": 0.1664, + "step": 8993 + }, + { + "epoch": 0.143904, + "grad_norm": 1.0625, + "learning_rate": 8.630161290322581e-05, + "loss": 0.1481, + "step": 8994 + }, + { + "epoch": 0.14392, + "grad_norm": 1.0234375, + "learning_rate": 8.63e-05, + "loss": 0.1587, + "step": 8995 + }, + { + "epoch": 0.143936, + "grad_norm": 1.1171875, + "learning_rate": 8.62983870967742e-05, + "loss": 0.1931, + "step": 8996 + }, + { + "epoch": 0.143952, + "grad_norm": 1.4296875, + "learning_rate": 8.62967741935484e-05, + "loss": 0.2035, + "step": 8997 + }, + { + "epoch": 0.143968, + "grad_norm": 1.015625, + "learning_rate": 8.629516129032258e-05, + "loss": 0.1849, + "step": 8998 + }, + { + "epoch": 0.143984, + "grad_norm": 1.375, + "learning_rate": 8.629354838709678e-05, + "loss": 0.1773, + "step": 8999 + }, + { + "epoch": 0.144, + "grad_norm": 1.0, + "learning_rate": 8.629193548387097e-05, + "loss": 0.1586, + "step": 9000 + }, + { + "epoch": 0.144016, + "grad_norm": 1.28125, + "learning_rate": 8.629032258064517e-05, + "loss": 0.1694, + "step": 9001 + }, + { + "epoch": 0.144032, + "grad_norm": 0.83984375, + "learning_rate": 8.628870967741935e-05, + "loss": 0.1646, + "step": 9002 + }, + { + "epoch": 0.144048, + "grad_norm": 0.828125, + "learning_rate": 8.628709677419355e-05, + "loss": 0.1349, + "step": 9003 + }, + { + "epoch": 0.144064, + "grad_norm": 1.21875, + "learning_rate": 8.628548387096774e-05, + "loss": 0.2061, + "step": 9004 + }, + { + "epoch": 0.14408, + "grad_norm": 0.70703125, + "learning_rate": 8.628387096774194e-05, + "loss": 0.1602, + "step": 9005 + }, + { + "epoch": 0.144096, + "grad_norm": 0.73046875, + "learning_rate": 8.628225806451613e-05, + "loss": 0.1896, + "step": 9006 + }, + { + "epoch": 0.144112, + "grad_norm": 1.1015625, + "learning_rate": 8.628064516129032e-05, + "loss": 0.2073, + "step": 9007 + }, + { + "epoch": 0.144128, + "grad_norm": 0.84375, + "learning_rate": 8.627903225806452e-05, + "loss": 0.1973, + "step": 9008 + }, + { + "epoch": 0.144144, + "grad_norm": 1.4453125, + "learning_rate": 8.627741935483872e-05, + "loss": 0.18, + "step": 9009 + }, + { + "epoch": 0.14416, + "grad_norm": 0.78515625, + "learning_rate": 8.627580645161291e-05, + "loss": 0.1615, + "step": 9010 + }, + { + "epoch": 0.144176, + "grad_norm": 1.28125, + "learning_rate": 8.627419354838711e-05, + "loss": 0.2138, + "step": 9011 + }, + { + "epoch": 0.144192, + "grad_norm": 0.87890625, + "learning_rate": 8.62725806451613e-05, + "loss": 0.1623, + "step": 9012 + }, + { + "epoch": 0.144208, + "grad_norm": 0.72265625, + "learning_rate": 8.62709677419355e-05, + "loss": 0.1743, + "step": 9013 + }, + { + "epoch": 0.144224, + "grad_norm": 0.8359375, + "learning_rate": 8.626935483870968e-05, + "loss": 0.1758, + "step": 9014 + }, + { + "epoch": 0.14424, + "grad_norm": 0.8828125, + "learning_rate": 8.626774193548387e-05, + "loss": 0.1532, + "step": 9015 + }, + { + "epoch": 0.144256, + "grad_norm": 0.71484375, + "learning_rate": 8.626612903225807e-05, + "loss": 0.1658, + "step": 9016 + }, + { + "epoch": 0.144272, + "grad_norm": 0.87109375, + "learning_rate": 8.626451612903225e-05, + "loss": 0.1666, + "step": 9017 + }, + { + "epoch": 0.144288, + "grad_norm": 1.3515625, + "learning_rate": 8.626290322580645e-05, + "loss": 0.1226, + "step": 9018 + }, + { + "epoch": 0.144304, + "grad_norm": 0.8671875, + "learning_rate": 8.626129032258064e-05, + "loss": 0.2015, + "step": 9019 + }, + { + "epoch": 0.14432, + "grad_norm": 0.94921875, + "learning_rate": 8.625967741935484e-05, + "loss": 0.1805, + "step": 9020 + }, + { + "epoch": 0.144336, + "grad_norm": 0.90234375, + "learning_rate": 8.625806451612904e-05, + "loss": 0.1721, + "step": 9021 + }, + { + "epoch": 0.144352, + "grad_norm": 0.8671875, + "learning_rate": 8.625645161290324e-05, + "loss": 0.178, + "step": 9022 + }, + { + "epoch": 0.144368, + "grad_norm": 0.96484375, + "learning_rate": 8.625483870967742e-05, + "loss": 0.1761, + "step": 9023 + }, + { + "epoch": 0.144384, + "grad_norm": 0.984375, + "learning_rate": 8.625322580645162e-05, + "loss": 0.188, + "step": 9024 + }, + { + "epoch": 0.1444, + "grad_norm": 0.7109375, + "learning_rate": 8.625161290322581e-05, + "loss": 0.2036, + "step": 9025 + }, + { + "epoch": 0.144416, + "grad_norm": 1.015625, + "learning_rate": 8.625000000000001e-05, + "loss": 0.1882, + "step": 9026 + }, + { + "epoch": 0.144432, + "grad_norm": 0.51171875, + "learning_rate": 8.62483870967742e-05, + "loss": 0.1685, + "step": 9027 + }, + { + "epoch": 0.144448, + "grad_norm": 0.82421875, + "learning_rate": 8.62467741935484e-05, + "loss": 0.1803, + "step": 9028 + }, + { + "epoch": 0.144464, + "grad_norm": 0.90625, + "learning_rate": 8.624516129032258e-05, + "loss": 0.1597, + "step": 9029 + }, + { + "epoch": 0.14448, + "grad_norm": 0.69921875, + "learning_rate": 8.624354838709677e-05, + "loss": 0.1694, + "step": 9030 + }, + { + "epoch": 0.144496, + "grad_norm": 0.7890625, + "learning_rate": 8.624193548387097e-05, + "loss": 0.222, + "step": 9031 + }, + { + "epoch": 0.144512, + "grad_norm": 0.73046875, + "learning_rate": 8.624032258064517e-05, + "loss": 0.1712, + "step": 9032 + }, + { + "epoch": 0.144528, + "grad_norm": 0.69140625, + "learning_rate": 8.623870967741937e-05, + "loss": 0.1484, + "step": 9033 + }, + { + "epoch": 0.144544, + "grad_norm": 0.859375, + "learning_rate": 8.623709677419355e-05, + "loss": 0.1818, + "step": 9034 + }, + { + "epoch": 0.14456, + "grad_norm": 0.8046875, + "learning_rate": 8.623548387096775e-05, + "loss": 0.1432, + "step": 9035 + }, + { + "epoch": 0.144576, + "grad_norm": 1.6875, + "learning_rate": 8.623387096774194e-05, + "loss": 0.2101, + "step": 9036 + }, + { + "epoch": 0.144592, + "grad_norm": 1.234375, + "learning_rate": 8.623225806451614e-05, + "loss": 0.1907, + "step": 9037 + }, + { + "epoch": 0.144608, + "grad_norm": 0.62890625, + "learning_rate": 8.623064516129032e-05, + "loss": 0.1585, + "step": 9038 + }, + { + "epoch": 0.144624, + "grad_norm": 1.0390625, + "learning_rate": 8.622903225806452e-05, + "loss": 0.2153, + "step": 9039 + }, + { + "epoch": 0.14464, + "grad_norm": 0.71484375, + "learning_rate": 8.622741935483871e-05, + "loss": 0.1721, + "step": 9040 + }, + { + "epoch": 0.144656, + "grad_norm": 0.5625, + "learning_rate": 8.622580645161291e-05, + "loss": 0.1413, + "step": 9041 + }, + { + "epoch": 0.144672, + "grad_norm": 0.59375, + "learning_rate": 8.62241935483871e-05, + "loss": 0.1656, + "step": 9042 + }, + { + "epoch": 0.144688, + "grad_norm": 0.6015625, + "learning_rate": 8.62225806451613e-05, + "loss": 0.1771, + "step": 9043 + }, + { + "epoch": 0.144704, + "grad_norm": 0.7578125, + "learning_rate": 8.62209677419355e-05, + "loss": 0.2089, + "step": 9044 + }, + { + "epoch": 0.14472, + "grad_norm": 0.71875, + "learning_rate": 8.621935483870968e-05, + "loss": 0.2009, + "step": 9045 + }, + { + "epoch": 0.144736, + "grad_norm": 0.96875, + "learning_rate": 8.621774193548388e-05, + "loss": 0.1622, + "step": 9046 + }, + { + "epoch": 0.144752, + "grad_norm": 0.87109375, + "learning_rate": 8.621612903225807e-05, + "loss": 0.1583, + "step": 9047 + }, + { + "epoch": 0.144768, + "grad_norm": 1.0234375, + "learning_rate": 8.621451612903227e-05, + "loss": 0.1679, + "step": 9048 + }, + { + "epoch": 0.144784, + "grad_norm": 0.796875, + "learning_rate": 8.621290322580645e-05, + "loss": 0.1884, + "step": 9049 + }, + { + "epoch": 0.1448, + "grad_norm": 0.6953125, + "learning_rate": 8.621129032258065e-05, + "loss": 0.1519, + "step": 9050 + }, + { + "epoch": 0.144816, + "grad_norm": 1.2890625, + "learning_rate": 8.620967741935484e-05, + "loss": 0.2044, + "step": 9051 + }, + { + "epoch": 0.144832, + "grad_norm": 0.74609375, + "learning_rate": 8.620806451612904e-05, + "loss": 0.1578, + "step": 9052 + }, + { + "epoch": 0.144848, + "grad_norm": 0.71875, + "learning_rate": 8.620645161290322e-05, + "loss": 0.1637, + "step": 9053 + }, + { + "epoch": 0.144864, + "grad_norm": 0.84375, + "learning_rate": 8.620483870967742e-05, + "loss": 0.1847, + "step": 9054 + }, + { + "epoch": 0.14488, + "grad_norm": 1.0546875, + "learning_rate": 8.620322580645161e-05, + "loss": 0.1892, + "step": 9055 + }, + { + "epoch": 0.144896, + "grad_norm": 0.83203125, + "learning_rate": 8.620161290322581e-05, + "loss": 0.1402, + "step": 9056 + }, + { + "epoch": 0.144912, + "grad_norm": 0.9140625, + "learning_rate": 8.620000000000001e-05, + "loss": 0.1851, + "step": 9057 + }, + { + "epoch": 0.144928, + "grad_norm": 0.84375, + "learning_rate": 8.619838709677421e-05, + "loss": 0.1735, + "step": 9058 + }, + { + "epoch": 0.144944, + "grad_norm": 0.890625, + "learning_rate": 8.61967741935484e-05, + "loss": 0.227, + "step": 9059 + }, + { + "epoch": 0.14496, + "grad_norm": 0.69921875, + "learning_rate": 8.61951612903226e-05, + "loss": 0.1632, + "step": 9060 + }, + { + "epoch": 0.144976, + "grad_norm": 0.72265625, + "learning_rate": 8.619354838709678e-05, + "loss": 0.2003, + "step": 9061 + }, + { + "epoch": 0.144992, + "grad_norm": 0.70703125, + "learning_rate": 8.619193548387097e-05, + "loss": 0.1489, + "step": 9062 + }, + { + "epoch": 0.145008, + "grad_norm": 0.73046875, + "learning_rate": 8.619032258064517e-05, + "loss": 0.2041, + "step": 9063 + }, + { + "epoch": 0.145024, + "grad_norm": 0.5859375, + "learning_rate": 8.618870967741935e-05, + "loss": 0.1682, + "step": 9064 + }, + { + "epoch": 0.14504, + "grad_norm": 0.72265625, + "learning_rate": 8.618709677419355e-05, + "loss": 0.1485, + "step": 9065 + }, + { + "epoch": 0.145056, + "grad_norm": 0.546875, + "learning_rate": 8.618548387096774e-05, + "loss": 0.1767, + "step": 9066 + }, + { + "epoch": 0.145072, + "grad_norm": 1.09375, + "learning_rate": 8.618387096774194e-05, + "loss": 0.1837, + "step": 9067 + }, + { + "epoch": 0.145088, + "grad_norm": 0.61328125, + "learning_rate": 8.618225806451614e-05, + "loss": 0.1785, + "step": 9068 + }, + { + "epoch": 0.145104, + "grad_norm": 0.703125, + "learning_rate": 8.618064516129034e-05, + "loss": 0.1501, + "step": 9069 + }, + { + "epoch": 0.14512, + "grad_norm": 0.79296875, + "learning_rate": 8.617903225806452e-05, + "loss": 0.1713, + "step": 9070 + }, + { + "epoch": 0.145136, + "grad_norm": 0.890625, + "learning_rate": 8.617741935483872e-05, + "loss": 0.1923, + "step": 9071 + }, + { + "epoch": 0.145152, + "grad_norm": 0.68359375, + "learning_rate": 8.617580645161291e-05, + "loss": 0.1849, + "step": 9072 + }, + { + "epoch": 0.145168, + "grad_norm": 1.0625, + "learning_rate": 8.617419354838711e-05, + "loss": 0.1631, + "step": 9073 + }, + { + "epoch": 0.145184, + "grad_norm": 0.67578125, + "learning_rate": 8.61725806451613e-05, + "loss": 0.1517, + "step": 9074 + }, + { + "epoch": 0.1452, + "grad_norm": 1.078125, + "learning_rate": 8.617096774193549e-05, + "loss": 0.1913, + "step": 9075 + }, + { + "epoch": 0.145216, + "grad_norm": 0.75390625, + "learning_rate": 8.616935483870968e-05, + "loss": 0.1758, + "step": 9076 + }, + { + "epoch": 0.145232, + "grad_norm": 1.171875, + "learning_rate": 8.616774193548387e-05, + "loss": 0.188, + "step": 9077 + }, + { + "epoch": 0.145248, + "grad_norm": 1.0, + "learning_rate": 8.616612903225806e-05, + "loss": 0.185, + "step": 9078 + }, + { + "epoch": 0.145264, + "grad_norm": 0.97265625, + "learning_rate": 8.616451612903226e-05, + "loss": 0.1797, + "step": 9079 + }, + { + "epoch": 0.14528, + "grad_norm": 0.62890625, + "learning_rate": 8.616290322580645e-05, + "loss": 0.1458, + "step": 9080 + }, + { + "epoch": 0.145296, + "grad_norm": 0.91015625, + "learning_rate": 8.616129032258065e-05, + "loss": 0.2259, + "step": 9081 + }, + { + "epoch": 0.145312, + "grad_norm": 1.296875, + "learning_rate": 8.615967741935485e-05, + "loss": 0.1769, + "step": 9082 + }, + { + "epoch": 0.145328, + "grad_norm": 0.7578125, + "learning_rate": 8.615806451612904e-05, + "loss": 0.1626, + "step": 9083 + }, + { + "epoch": 0.145344, + "grad_norm": 0.79296875, + "learning_rate": 8.615645161290324e-05, + "loss": 0.1945, + "step": 9084 + }, + { + "epoch": 0.14536, + "grad_norm": 0.67578125, + "learning_rate": 8.615483870967742e-05, + "loss": 0.1737, + "step": 9085 + }, + { + "epoch": 0.145376, + "grad_norm": 0.7421875, + "learning_rate": 8.615322580645162e-05, + "loss": 0.1617, + "step": 9086 + }, + { + "epoch": 0.145392, + "grad_norm": 0.80078125, + "learning_rate": 8.615161290322581e-05, + "loss": 0.1469, + "step": 9087 + }, + { + "epoch": 0.145408, + "grad_norm": 0.68359375, + "learning_rate": 8.615000000000001e-05, + "loss": 0.1466, + "step": 9088 + }, + { + "epoch": 0.145424, + "grad_norm": 0.7890625, + "learning_rate": 8.614838709677419e-05, + "loss": 0.1693, + "step": 9089 + }, + { + "epoch": 0.14544, + "grad_norm": 0.94140625, + "learning_rate": 8.614677419354839e-05, + "loss": 0.156, + "step": 9090 + }, + { + "epoch": 0.145456, + "grad_norm": 0.6640625, + "learning_rate": 8.614516129032258e-05, + "loss": 0.1483, + "step": 9091 + }, + { + "epoch": 0.145472, + "grad_norm": 1.1171875, + "learning_rate": 8.614354838709678e-05, + "loss": 0.1953, + "step": 9092 + }, + { + "epoch": 0.145488, + "grad_norm": 0.5859375, + "learning_rate": 8.614193548387098e-05, + "loss": 0.1553, + "step": 9093 + }, + { + "epoch": 0.145504, + "grad_norm": 0.9765625, + "learning_rate": 8.614032258064516e-05, + "loss": 0.17, + "step": 9094 + }, + { + "epoch": 0.14552, + "grad_norm": 0.8203125, + "learning_rate": 8.613870967741936e-05, + "loss": 0.1781, + "step": 9095 + }, + { + "epoch": 0.145536, + "grad_norm": 0.7578125, + "learning_rate": 8.613709677419355e-05, + "loss": 0.176, + "step": 9096 + }, + { + "epoch": 0.145552, + "grad_norm": 0.73828125, + "learning_rate": 8.613548387096775e-05, + "loss": 0.1877, + "step": 9097 + }, + { + "epoch": 0.145568, + "grad_norm": 0.84375, + "learning_rate": 8.613387096774194e-05, + "loss": 0.1797, + "step": 9098 + }, + { + "epoch": 0.145584, + "grad_norm": 0.703125, + "learning_rate": 8.613225806451614e-05, + "loss": 0.1686, + "step": 9099 + }, + { + "epoch": 0.1456, + "grad_norm": 0.65625, + "learning_rate": 8.613064516129032e-05, + "loss": 0.1359, + "step": 9100 + }, + { + "epoch": 0.145616, + "grad_norm": 0.5625, + "learning_rate": 8.612903225806452e-05, + "loss": 0.1479, + "step": 9101 + }, + { + "epoch": 0.145632, + "grad_norm": 1.15625, + "learning_rate": 8.612741935483871e-05, + "loss": 0.1775, + "step": 9102 + }, + { + "epoch": 0.145648, + "grad_norm": 1.078125, + "learning_rate": 8.61258064516129e-05, + "loss": 0.1886, + "step": 9103 + }, + { + "epoch": 0.145664, + "grad_norm": 0.79296875, + "learning_rate": 8.61241935483871e-05, + "loss": 0.1499, + "step": 9104 + }, + { + "epoch": 0.14568, + "grad_norm": 0.54296875, + "learning_rate": 8.61225806451613e-05, + "loss": 0.1142, + "step": 9105 + }, + { + "epoch": 0.145696, + "grad_norm": 0.984375, + "learning_rate": 8.612096774193549e-05, + "loss": 0.1547, + "step": 9106 + }, + { + "epoch": 0.145712, + "grad_norm": 0.546875, + "learning_rate": 8.611935483870968e-05, + "loss": 0.1607, + "step": 9107 + }, + { + "epoch": 0.145728, + "grad_norm": 0.703125, + "learning_rate": 8.611774193548388e-05, + "loss": 0.1358, + "step": 9108 + }, + { + "epoch": 0.145744, + "grad_norm": 0.6953125, + "learning_rate": 8.611612903225806e-05, + "loss": 0.144, + "step": 9109 + }, + { + "epoch": 0.14576, + "grad_norm": 0.78125, + "learning_rate": 8.611451612903226e-05, + "loss": 0.1743, + "step": 9110 + }, + { + "epoch": 0.145776, + "grad_norm": 0.984375, + "learning_rate": 8.611290322580645e-05, + "loss": 0.2062, + "step": 9111 + }, + { + "epoch": 0.145792, + "grad_norm": 0.478515625, + "learning_rate": 8.611129032258065e-05, + "loss": 0.1362, + "step": 9112 + }, + { + "epoch": 0.145808, + "grad_norm": 0.71875, + "learning_rate": 8.610967741935484e-05, + "loss": 0.2084, + "step": 9113 + }, + { + "epoch": 0.145824, + "grad_norm": 0.5625, + "learning_rate": 8.610806451612903e-05, + "loss": 0.1387, + "step": 9114 + }, + { + "epoch": 0.14584, + "grad_norm": 0.63671875, + "learning_rate": 8.610645161290322e-05, + "loss": 0.1736, + "step": 9115 + }, + { + "epoch": 0.145856, + "grad_norm": 0.76171875, + "learning_rate": 8.610483870967742e-05, + "loss": 0.1942, + "step": 9116 + }, + { + "epoch": 0.145872, + "grad_norm": 0.70703125, + "learning_rate": 8.610322580645162e-05, + "loss": 0.1565, + "step": 9117 + }, + { + "epoch": 0.145888, + "grad_norm": 0.74609375, + "learning_rate": 8.610161290322582e-05, + "loss": 0.1864, + "step": 9118 + }, + { + "epoch": 0.145904, + "grad_norm": 0.6171875, + "learning_rate": 8.61e-05, + "loss": 0.1698, + "step": 9119 + }, + { + "epoch": 0.14592, + "grad_norm": 0.64453125, + "learning_rate": 8.60983870967742e-05, + "loss": 0.1601, + "step": 9120 + }, + { + "epoch": 0.145936, + "grad_norm": 0.86328125, + "learning_rate": 8.609677419354839e-05, + "loss": 0.1421, + "step": 9121 + }, + { + "epoch": 0.145952, + "grad_norm": 0.56640625, + "learning_rate": 8.609516129032259e-05, + "loss": 0.1665, + "step": 9122 + }, + { + "epoch": 0.145968, + "grad_norm": 0.75390625, + "learning_rate": 8.609354838709678e-05, + "loss": 0.1321, + "step": 9123 + }, + { + "epoch": 0.145984, + "grad_norm": 0.99609375, + "learning_rate": 8.609193548387096e-05, + "loss": 0.1826, + "step": 9124 + }, + { + "epoch": 0.146, + "grad_norm": 0.79296875, + "learning_rate": 8.609032258064516e-05, + "loss": 0.1718, + "step": 9125 + }, + { + "epoch": 0.146016, + "grad_norm": 0.56640625, + "learning_rate": 8.608870967741935e-05, + "loss": 0.1622, + "step": 9126 + }, + { + "epoch": 0.146032, + "grad_norm": 0.95703125, + "learning_rate": 8.608709677419355e-05, + "loss": 0.1679, + "step": 9127 + }, + { + "epoch": 0.146048, + "grad_norm": 0.8203125, + "learning_rate": 8.608548387096775e-05, + "loss": 0.1671, + "step": 9128 + }, + { + "epoch": 0.146064, + "grad_norm": 1.46875, + "learning_rate": 8.608387096774195e-05, + "loss": 0.1829, + "step": 9129 + }, + { + "epoch": 0.14608, + "grad_norm": 0.5390625, + "learning_rate": 8.608225806451613e-05, + "loss": 0.1739, + "step": 9130 + }, + { + "epoch": 0.146096, + "grad_norm": 1.015625, + "learning_rate": 8.608064516129033e-05, + "loss": 0.158, + "step": 9131 + }, + { + "epoch": 0.146112, + "grad_norm": 0.76953125, + "learning_rate": 8.607903225806452e-05, + "loss": 0.1863, + "step": 9132 + }, + { + "epoch": 0.146128, + "grad_norm": 1.265625, + "learning_rate": 8.607741935483872e-05, + "loss": 0.1724, + "step": 9133 + }, + { + "epoch": 0.146144, + "grad_norm": 0.8046875, + "learning_rate": 8.60758064516129e-05, + "loss": 0.1879, + "step": 9134 + }, + { + "epoch": 0.14616, + "grad_norm": 0.75390625, + "learning_rate": 8.60741935483871e-05, + "loss": 0.189, + "step": 9135 + }, + { + "epoch": 0.146176, + "grad_norm": 0.98828125, + "learning_rate": 8.607258064516129e-05, + "loss": 0.1519, + "step": 9136 + }, + { + "epoch": 0.146192, + "grad_norm": 0.62890625, + "learning_rate": 8.607096774193549e-05, + "loss": 0.1601, + "step": 9137 + }, + { + "epoch": 0.146208, + "grad_norm": 0.8125, + "learning_rate": 8.606935483870968e-05, + "loss": 0.1603, + "step": 9138 + }, + { + "epoch": 0.146224, + "grad_norm": 1.0078125, + "learning_rate": 8.606774193548388e-05, + "loss": 0.2053, + "step": 9139 + }, + { + "epoch": 0.14624, + "grad_norm": 0.5703125, + "learning_rate": 8.606612903225808e-05, + "loss": 0.1391, + "step": 9140 + }, + { + "epoch": 0.146256, + "grad_norm": 0.796875, + "learning_rate": 8.606451612903226e-05, + "loss": 0.1749, + "step": 9141 + }, + { + "epoch": 0.146272, + "grad_norm": 0.86328125, + "learning_rate": 8.606290322580646e-05, + "loss": 0.1781, + "step": 9142 + }, + { + "epoch": 0.146288, + "grad_norm": 0.8828125, + "learning_rate": 8.606129032258065e-05, + "loss": 0.1997, + "step": 9143 + }, + { + "epoch": 0.146304, + "grad_norm": 0.55859375, + "learning_rate": 8.605967741935485e-05, + "loss": 0.1769, + "step": 9144 + }, + { + "epoch": 0.14632, + "grad_norm": 0.498046875, + "learning_rate": 8.605806451612903e-05, + "loss": 0.1495, + "step": 9145 + }, + { + "epoch": 0.146336, + "grad_norm": 1.4296875, + "learning_rate": 8.605645161290323e-05, + "loss": 0.2052, + "step": 9146 + }, + { + "epoch": 0.146352, + "grad_norm": 0.7890625, + "learning_rate": 8.605483870967742e-05, + "loss": 0.1731, + "step": 9147 + }, + { + "epoch": 0.146368, + "grad_norm": 0.765625, + "learning_rate": 8.605322580645162e-05, + "loss": 0.1395, + "step": 9148 + }, + { + "epoch": 0.146384, + "grad_norm": 0.65625, + "learning_rate": 8.60516129032258e-05, + "loss": 0.1724, + "step": 9149 + }, + { + "epoch": 0.1464, + "grad_norm": 0.859375, + "learning_rate": 8.605e-05, + "loss": 0.1602, + "step": 9150 + }, + { + "epoch": 0.146416, + "grad_norm": 0.65625, + "learning_rate": 8.604838709677419e-05, + "loss": 0.1528, + "step": 9151 + }, + { + "epoch": 0.146432, + "grad_norm": 0.60546875, + "learning_rate": 8.604677419354839e-05, + "loss": 0.1521, + "step": 9152 + }, + { + "epoch": 0.146448, + "grad_norm": 0.75390625, + "learning_rate": 8.604516129032259e-05, + "loss": 0.155, + "step": 9153 + }, + { + "epoch": 0.146464, + "grad_norm": 0.59765625, + "learning_rate": 8.604354838709678e-05, + "loss": 0.1971, + "step": 9154 + }, + { + "epoch": 0.14648, + "grad_norm": 1.046875, + "learning_rate": 8.604193548387098e-05, + "loss": 0.1632, + "step": 9155 + }, + { + "epoch": 0.146496, + "grad_norm": 1.265625, + "learning_rate": 8.604032258064516e-05, + "loss": 0.1798, + "step": 9156 + }, + { + "epoch": 0.146512, + "grad_norm": 0.93359375, + "learning_rate": 8.603870967741936e-05, + "loss": 0.1858, + "step": 9157 + }, + { + "epoch": 0.146528, + "grad_norm": 1.359375, + "learning_rate": 8.603709677419355e-05, + "loss": 0.2101, + "step": 9158 + }, + { + "epoch": 0.146544, + "grad_norm": 0.7421875, + "learning_rate": 8.603548387096775e-05, + "loss": 0.1515, + "step": 9159 + }, + { + "epoch": 0.14656, + "grad_norm": 0.81640625, + "learning_rate": 8.603387096774193e-05, + "loss": 0.1952, + "step": 9160 + }, + { + "epoch": 0.146576, + "grad_norm": 0.91015625, + "learning_rate": 8.603225806451613e-05, + "loss": 0.228, + "step": 9161 + }, + { + "epoch": 0.146592, + "grad_norm": 0.71484375, + "learning_rate": 8.603064516129032e-05, + "loss": 0.1566, + "step": 9162 + }, + { + "epoch": 0.146608, + "grad_norm": 0.7265625, + "learning_rate": 8.602903225806452e-05, + "loss": 0.1369, + "step": 9163 + }, + { + "epoch": 0.146624, + "grad_norm": 0.890625, + "learning_rate": 8.602741935483872e-05, + "loss": 0.216, + "step": 9164 + }, + { + "epoch": 0.14664, + "grad_norm": 0.60546875, + "learning_rate": 8.602580645161292e-05, + "loss": 0.1753, + "step": 9165 + }, + { + "epoch": 0.146656, + "grad_norm": 0.73046875, + "learning_rate": 8.60241935483871e-05, + "loss": 0.1843, + "step": 9166 + }, + { + "epoch": 0.146672, + "grad_norm": 0.8671875, + "learning_rate": 8.60225806451613e-05, + "loss": 0.1779, + "step": 9167 + }, + { + "epoch": 0.146688, + "grad_norm": 0.94140625, + "learning_rate": 8.602096774193549e-05, + "loss": 0.1569, + "step": 9168 + }, + { + "epoch": 0.146704, + "grad_norm": 0.69140625, + "learning_rate": 8.601935483870969e-05, + "loss": 0.2055, + "step": 9169 + }, + { + "epoch": 0.14672, + "grad_norm": 0.68359375, + "learning_rate": 8.601774193548388e-05, + "loss": 0.1594, + "step": 9170 + }, + { + "epoch": 0.146736, + "grad_norm": 0.79296875, + "learning_rate": 8.601612903225806e-05, + "loss": 0.1537, + "step": 9171 + }, + { + "epoch": 0.146752, + "grad_norm": 0.96875, + "learning_rate": 8.601451612903226e-05, + "loss": 0.2112, + "step": 9172 + }, + { + "epoch": 0.146768, + "grad_norm": 1.1328125, + "learning_rate": 8.601290322580645e-05, + "loss": 0.1791, + "step": 9173 + }, + { + "epoch": 0.146784, + "grad_norm": 0.6015625, + "learning_rate": 8.601129032258065e-05, + "loss": 0.1534, + "step": 9174 + }, + { + "epoch": 0.1468, + "grad_norm": 0.7890625, + "learning_rate": 8.600967741935483e-05, + "loss": 0.1721, + "step": 9175 + }, + { + "epoch": 0.146816, + "grad_norm": 1.0546875, + "learning_rate": 8.600806451612903e-05, + "loss": 0.2203, + "step": 9176 + }, + { + "epoch": 0.146832, + "grad_norm": 0.4296875, + "learning_rate": 8.600645161290323e-05, + "loss": 0.1611, + "step": 9177 + }, + { + "epoch": 0.146848, + "grad_norm": 0.8046875, + "learning_rate": 8.600483870967743e-05, + "loss": 0.141, + "step": 9178 + }, + { + "epoch": 0.146864, + "grad_norm": 0.734375, + "learning_rate": 8.600322580645162e-05, + "loss": 0.188, + "step": 9179 + }, + { + "epoch": 0.14688, + "grad_norm": 0.77734375, + "learning_rate": 8.600161290322582e-05, + "loss": 0.1568, + "step": 9180 + }, + { + "epoch": 0.146896, + "grad_norm": 1.2734375, + "learning_rate": 8.6e-05, + "loss": 0.2025, + "step": 9181 + }, + { + "epoch": 0.146912, + "grad_norm": 0.734375, + "learning_rate": 8.59983870967742e-05, + "loss": 0.1848, + "step": 9182 + }, + { + "epoch": 0.146928, + "grad_norm": 0.67578125, + "learning_rate": 8.599677419354839e-05, + "loss": 0.1272, + "step": 9183 + }, + { + "epoch": 0.146944, + "grad_norm": 0.8359375, + "learning_rate": 8.599516129032259e-05, + "loss": 0.1745, + "step": 9184 + }, + { + "epoch": 0.14696, + "grad_norm": 1.015625, + "learning_rate": 8.599354838709677e-05, + "loss": 0.2325, + "step": 9185 + }, + { + "epoch": 0.146976, + "grad_norm": 0.7578125, + "learning_rate": 8.599193548387096e-05, + "loss": 0.1435, + "step": 9186 + }, + { + "epoch": 0.146992, + "grad_norm": 0.84765625, + "learning_rate": 8.599032258064516e-05, + "loss": 0.1596, + "step": 9187 + }, + { + "epoch": 0.147008, + "grad_norm": 0.6328125, + "learning_rate": 8.598870967741936e-05, + "loss": 0.1722, + "step": 9188 + }, + { + "epoch": 0.147024, + "grad_norm": 0.8125, + "learning_rate": 8.598709677419356e-05, + "loss": 0.1948, + "step": 9189 + }, + { + "epoch": 0.14704, + "grad_norm": 0.66796875, + "learning_rate": 8.598548387096775e-05, + "loss": 0.1498, + "step": 9190 + }, + { + "epoch": 0.147056, + "grad_norm": 1.4140625, + "learning_rate": 8.598387096774195e-05, + "loss": 0.1913, + "step": 9191 + }, + { + "epoch": 0.147072, + "grad_norm": 1.03125, + "learning_rate": 8.598225806451613e-05, + "loss": 0.1951, + "step": 9192 + }, + { + "epoch": 0.147088, + "grad_norm": 0.9453125, + "learning_rate": 8.598064516129033e-05, + "loss": 0.179, + "step": 9193 + }, + { + "epoch": 0.147104, + "grad_norm": 0.73828125, + "learning_rate": 8.597903225806452e-05, + "loss": 0.1499, + "step": 9194 + }, + { + "epoch": 0.14712, + "grad_norm": 0.71875, + "learning_rate": 8.597741935483872e-05, + "loss": 0.1946, + "step": 9195 + }, + { + "epoch": 0.147136, + "grad_norm": 0.890625, + "learning_rate": 8.59758064516129e-05, + "loss": 0.1702, + "step": 9196 + }, + { + "epoch": 0.147152, + "grad_norm": 0.66015625, + "learning_rate": 8.59741935483871e-05, + "loss": 0.1762, + "step": 9197 + }, + { + "epoch": 0.147168, + "grad_norm": 0.66015625, + "learning_rate": 8.597258064516129e-05, + "loss": 0.1879, + "step": 9198 + }, + { + "epoch": 0.147184, + "grad_norm": 0.75, + "learning_rate": 8.597096774193549e-05, + "loss": 0.1859, + "step": 9199 + }, + { + "epoch": 0.1472, + "grad_norm": 0.59375, + "learning_rate": 8.596935483870969e-05, + "loss": 0.1545, + "step": 9200 + }, + { + "epoch": 0.147216, + "grad_norm": 1.0546875, + "learning_rate": 8.596774193548387e-05, + "loss": 0.1574, + "step": 9201 + }, + { + "epoch": 0.147232, + "grad_norm": 0.73046875, + "learning_rate": 8.596612903225807e-05, + "loss": 0.1702, + "step": 9202 + }, + { + "epoch": 0.147248, + "grad_norm": 1.0859375, + "learning_rate": 8.596451612903226e-05, + "loss": 0.1675, + "step": 9203 + }, + { + "epoch": 0.147264, + "grad_norm": 0.78125, + "learning_rate": 8.596290322580646e-05, + "loss": 0.2006, + "step": 9204 + }, + { + "epoch": 0.14728, + "grad_norm": 0.5859375, + "learning_rate": 8.596129032258065e-05, + "loss": 0.1428, + "step": 9205 + }, + { + "epoch": 0.147296, + "grad_norm": 1.03125, + "learning_rate": 8.595967741935484e-05, + "loss": 0.2267, + "step": 9206 + }, + { + "epoch": 0.147312, + "grad_norm": 0.8125, + "learning_rate": 8.595806451612903e-05, + "loss": 0.1549, + "step": 9207 + }, + { + "epoch": 0.147328, + "grad_norm": 0.578125, + "learning_rate": 8.595645161290323e-05, + "loss": 0.1386, + "step": 9208 + }, + { + "epoch": 0.147344, + "grad_norm": 0.6796875, + "learning_rate": 8.595483870967742e-05, + "loss": 0.1782, + "step": 9209 + }, + { + "epoch": 0.14736, + "grad_norm": 0.75, + "learning_rate": 8.595322580645162e-05, + "loss": 0.2008, + "step": 9210 + }, + { + "epoch": 0.147376, + "grad_norm": 1.046875, + "learning_rate": 8.59516129032258e-05, + "loss": 0.1717, + "step": 9211 + }, + { + "epoch": 0.147392, + "grad_norm": 0.75, + "learning_rate": 8.595e-05, + "loss": 0.1618, + "step": 9212 + }, + { + "epoch": 0.147408, + "grad_norm": 0.83984375, + "learning_rate": 8.59483870967742e-05, + "loss": 0.2014, + "step": 9213 + }, + { + "epoch": 0.147424, + "grad_norm": 0.609375, + "learning_rate": 8.59467741935484e-05, + "loss": 0.1631, + "step": 9214 + }, + { + "epoch": 0.14744, + "grad_norm": 1.0234375, + "learning_rate": 8.594516129032259e-05, + "loss": 0.1467, + "step": 9215 + }, + { + "epoch": 0.147456, + "grad_norm": 0.77734375, + "learning_rate": 8.594354838709677e-05, + "loss": 0.1764, + "step": 9216 + }, + { + "epoch": 0.147472, + "grad_norm": 0.80078125, + "learning_rate": 8.594193548387097e-05, + "loss": 0.1736, + "step": 9217 + }, + { + "epoch": 0.147488, + "grad_norm": 0.69140625, + "learning_rate": 8.594032258064516e-05, + "loss": 0.1858, + "step": 9218 + }, + { + "epoch": 0.147504, + "grad_norm": 0.79296875, + "learning_rate": 8.593870967741936e-05, + "loss": 0.1631, + "step": 9219 + }, + { + "epoch": 0.14752, + "grad_norm": 1.1640625, + "learning_rate": 8.593709677419354e-05, + "loss": 0.2124, + "step": 9220 + }, + { + "epoch": 0.147536, + "grad_norm": 0.9609375, + "learning_rate": 8.593548387096774e-05, + "loss": 0.2131, + "step": 9221 + }, + { + "epoch": 0.147552, + "grad_norm": 0.59765625, + "learning_rate": 8.593387096774193e-05, + "loss": 0.1474, + "step": 9222 + }, + { + "epoch": 0.147568, + "grad_norm": 0.65625, + "learning_rate": 8.593225806451613e-05, + "loss": 0.1725, + "step": 9223 + }, + { + "epoch": 0.147584, + "grad_norm": 0.83984375, + "learning_rate": 8.593064516129033e-05, + "loss": 0.2053, + "step": 9224 + }, + { + "epoch": 0.1476, + "grad_norm": 0.6875, + "learning_rate": 8.592903225806453e-05, + "loss": 0.1725, + "step": 9225 + }, + { + "epoch": 0.147616, + "grad_norm": 0.796875, + "learning_rate": 8.592741935483872e-05, + "loss": 0.1889, + "step": 9226 + }, + { + "epoch": 0.147632, + "grad_norm": 0.8359375, + "learning_rate": 8.592580645161292e-05, + "loss": 0.1908, + "step": 9227 + }, + { + "epoch": 0.147648, + "grad_norm": 0.7421875, + "learning_rate": 8.59241935483871e-05, + "loss": 0.1545, + "step": 9228 + }, + { + "epoch": 0.147664, + "grad_norm": 0.8828125, + "learning_rate": 8.59225806451613e-05, + "loss": 0.1761, + "step": 9229 + }, + { + "epoch": 0.14768, + "grad_norm": 1.015625, + "learning_rate": 8.592096774193549e-05, + "loss": 0.1808, + "step": 9230 + }, + { + "epoch": 0.147696, + "grad_norm": 0.66015625, + "learning_rate": 8.591935483870969e-05, + "loss": 0.1533, + "step": 9231 + }, + { + "epoch": 0.147712, + "grad_norm": 0.81640625, + "learning_rate": 8.591774193548387e-05, + "loss": 0.1519, + "step": 9232 + }, + { + "epoch": 0.147728, + "grad_norm": 0.8359375, + "learning_rate": 8.591612903225806e-05, + "loss": 0.2157, + "step": 9233 + }, + { + "epoch": 0.147744, + "grad_norm": 0.5078125, + "learning_rate": 8.591451612903226e-05, + "loss": 0.1692, + "step": 9234 + }, + { + "epoch": 0.14776, + "grad_norm": 0.71484375, + "learning_rate": 8.591290322580646e-05, + "loss": 0.1762, + "step": 9235 + }, + { + "epoch": 0.147776, + "grad_norm": 0.61328125, + "learning_rate": 8.591129032258066e-05, + "loss": 0.1669, + "step": 9236 + }, + { + "epoch": 0.147792, + "grad_norm": 0.76953125, + "learning_rate": 8.590967741935484e-05, + "loss": 0.1785, + "step": 9237 + }, + { + "epoch": 0.147808, + "grad_norm": 0.59765625, + "learning_rate": 8.590806451612904e-05, + "loss": 0.148, + "step": 9238 + }, + { + "epoch": 0.147824, + "grad_norm": 1.0859375, + "learning_rate": 8.590645161290323e-05, + "loss": 0.175, + "step": 9239 + }, + { + "epoch": 0.14784, + "grad_norm": 0.765625, + "learning_rate": 8.590483870967743e-05, + "loss": 0.179, + "step": 9240 + }, + { + "epoch": 0.147856, + "grad_norm": 0.5703125, + "learning_rate": 8.590322580645162e-05, + "loss": 0.1422, + "step": 9241 + }, + { + "epoch": 0.147872, + "grad_norm": 0.89453125, + "learning_rate": 8.590161290322581e-05, + "loss": 0.1632, + "step": 9242 + }, + { + "epoch": 0.147888, + "grad_norm": 0.67578125, + "learning_rate": 8.59e-05, + "loss": 0.1636, + "step": 9243 + }, + { + "epoch": 0.147904, + "grad_norm": 0.90625, + "learning_rate": 8.58983870967742e-05, + "loss": 0.1696, + "step": 9244 + }, + { + "epoch": 0.14792, + "grad_norm": 0.59765625, + "learning_rate": 8.589677419354839e-05, + "loss": 0.1954, + "step": 9245 + }, + { + "epoch": 0.147936, + "grad_norm": 0.75, + "learning_rate": 8.589516129032259e-05, + "loss": 0.1825, + "step": 9246 + }, + { + "epoch": 0.147952, + "grad_norm": 0.78515625, + "learning_rate": 8.589354838709677e-05, + "loss": 0.2036, + "step": 9247 + }, + { + "epoch": 0.147968, + "grad_norm": 1.0703125, + "learning_rate": 8.589193548387097e-05, + "loss": 0.1479, + "step": 9248 + }, + { + "epoch": 0.147984, + "grad_norm": 0.8203125, + "learning_rate": 8.589032258064517e-05, + "loss": 0.2114, + "step": 9249 + }, + { + "epoch": 0.148, + "grad_norm": 0.85546875, + "learning_rate": 8.588870967741936e-05, + "loss": 0.1842, + "step": 9250 + }, + { + "epoch": 0.148016, + "grad_norm": 0.78515625, + "learning_rate": 8.588709677419356e-05, + "loss": 0.149, + "step": 9251 + }, + { + "epoch": 0.148032, + "grad_norm": 0.6875, + "learning_rate": 8.588548387096774e-05, + "loss": 0.1695, + "step": 9252 + }, + { + "epoch": 0.148048, + "grad_norm": 0.79296875, + "learning_rate": 8.588387096774194e-05, + "loss": 0.188, + "step": 9253 + }, + { + "epoch": 0.148064, + "grad_norm": 0.703125, + "learning_rate": 8.588225806451613e-05, + "loss": 0.2128, + "step": 9254 + }, + { + "epoch": 0.14808, + "grad_norm": 0.66015625, + "learning_rate": 8.588064516129033e-05, + "loss": 0.1754, + "step": 9255 + }, + { + "epoch": 0.148096, + "grad_norm": 0.95703125, + "learning_rate": 8.587903225806451e-05, + "loss": 0.1735, + "step": 9256 + }, + { + "epoch": 0.148112, + "grad_norm": 0.8359375, + "learning_rate": 8.587741935483871e-05, + "loss": 0.19, + "step": 9257 + }, + { + "epoch": 0.148128, + "grad_norm": 0.890625, + "learning_rate": 8.58758064516129e-05, + "loss": 0.1646, + "step": 9258 + }, + { + "epoch": 0.148144, + "grad_norm": 1.0625, + "learning_rate": 8.58741935483871e-05, + "loss": 0.2085, + "step": 9259 + }, + { + "epoch": 0.14816, + "grad_norm": 1.4375, + "learning_rate": 8.58725806451613e-05, + "loss": 0.2251, + "step": 9260 + }, + { + "epoch": 0.148176, + "grad_norm": 0.703125, + "learning_rate": 8.58709677419355e-05, + "loss": 0.1859, + "step": 9261 + }, + { + "epoch": 0.148192, + "grad_norm": 0.625, + "learning_rate": 8.586935483870969e-05, + "loss": 0.1796, + "step": 9262 + }, + { + "epoch": 0.148208, + "grad_norm": 0.62890625, + "learning_rate": 8.586774193548387e-05, + "loss": 0.1844, + "step": 9263 + }, + { + "epoch": 0.148224, + "grad_norm": 0.6171875, + "learning_rate": 8.586612903225807e-05, + "loss": 0.1322, + "step": 9264 + }, + { + "epoch": 0.14824, + "grad_norm": 1.21875, + "learning_rate": 8.586451612903226e-05, + "loss": 0.1845, + "step": 9265 + }, + { + "epoch": 0.148256, + "grad_norm": 0.609375, + "learning_rate": 8.586290322580646e-05, + "loss": 0.1547, + "step": 9266 + }, + { + "epoch": 0.148272, + "grad_norm": 0.85546875, + "learning_rate": 8.586129032258064e-05, + "loss": 0.2331, + "step": 9267 + }, + { + "epoch": 0.148288, + "grad_norm": 0.8984375, + "learning_rate": 8.585967741935484e-05, + "loss": 0.1537, + "step": 9268 + }, + { + "epoch": 0.148304, + "grad_norm": 0.68359375, + "learning_rate": 8.585806451612903e-05, + "loss": 0.1745, + "step": 9269 + }, + { + "epoch": 0.14832, + "grad_norm": 0.93359375, + "learning_rate": 8.585645161290323e-05, + "loss": 0.2005, + "step": 9270 + }, + { + "epoch": 0.148336, + "grad_norm": 0.73828125, + "learning_rate": 8.585483870967741e-05, + "loss": 0.1709, + "step": 9271 + }, + { + "epoch": 0.148352, + "grad_norm": 1.0078125, + "learning_rate": 8.585322580645161e-05, + "loss": 0.1945, + "step": 9272 + }, + { + "epoch": 0.148368, + "grad_norm": 0.76171875, + "learning_rate": 8.585161290322581e-05, + "loss": 0.1848, + "step": 9273 + }, + { + "epoch": 0.148384, + "grad_norm": 0.671875, + "learning_rate": 8.585000000000001e-05, + "loss": 0.2025, + "step": 9274 + }, + { + "epoch": 0.1484, + "grad_norm": 0.890625, + "learning_rate": 8.58483870967742e-05, + "loss": 0.2376, + "step": 9275 + }, + { + "epoch": 0.148416, + "grad_norm": 0.76171875, + "learning_rate": 8.58467741935484e-05, + "loss": 0.1517, + "step": 9276 + }, + { + "epoch": 0.148432, + "grad_norm": 1.2734375, + "learning_rate": 8.584516129032258e-05, + "loss": 0.1713, + "step": 9277 + }, + { + "epoch": 0.148448, + "grad_norm": 0.796875, + "learning_rate": 8.584354838709678e-05, + "loss": 0.1982, + "step": 9278 + }, + { + "epoch": 0.148464, + "grad_norm": 0.890625, + "learning_rate": 8.584193548387097e-05, + "loss": 0.2247, + "step": 9279 + }, + { + "epoch": 0.14848, + "grad_norm": 0.98828125, + "learning_rate": 8.584032258064516e-05, + "loss": 0.1365, + "step": 9280 + }, + { + "epoch": 0.148496, + "grad_norm": 0.88671875, + "learning_rate": 8.583870967741936e-05, + "loss": 0.2145, + "step": 9281 + }, + { + "epoch": 0.148512, + "grad_norm": 1.453125, + "learning_rate": 8.583709677419354e-05, + "loss": 0.1476, + "step": 9282 + }, + { + "epoch": 0.148528, + "grad_norm": 0.515625, + "learning_rate": 8.583548387096774e-05, + "loss": 0.1383, + "step": 9283 + }, + { + "epoch": 0.148544, + "grad_norm": 1.0859375, + "learning_rate": 8.583387096774194e-05, + "loss": 0.2236, + "step": 9284 + }, + { + "epoch": 0.14856, + "grad_norm": 1.046875, + "learning_rate": 8.583225806451614e-05, + "loss": 0.1618, + "step": 9285 + }, + { + "epoch": 0.148576, + "grad_norm": 0.69921875, + "learning_rate": 8.583064516129033e-05, + "loss": 0.1644, + "step": 9286 + }, + { + "epoch": 0.148592, + "grad_norm": 0.7734375, + "learning_rate": 8.582903225806453e-05, + "loss": 0.1672, + "step": 9287 + }, + { + "epoch": 0.148608, + "grad_norm": 0.84765625, + "learning_rate": 8.582741935483871e-05, + "loss": 0.1854, + "step": 9288 + }, + { + "epoch": 0.148624, + "grad_norm": 1.3203125, + "learning_rate": 8.582580645161291e-05, + "loss": 0.174, + "step": 9289 + }, + { + "epoch": 0.14864, + "grad_norm": 0.515625, + "learning_rate": 8.58241935483871e-05, + "loss": 0.1692, + "step": 9290 + }, + { + "epoch": 0.148656, + "grad_norm": 0.73828125, + "learning_rate": 8.58225806451613e-05, + "loss": 0.1433, + "step": 9291 + }, + { + "epoch": 0.148672, + "grad_norm": 1.1328125, + "learning_rate": 8.582096774193548e-05, + "loss": 0.2088, + "step": 9292 + }, + { + "epoch": 0.148688, + "grad_norm": 0.74609375, + "learning_rate": 8.581935483870968e-05, + "loss": 0.1529, + "step": 9293 + }, + { + "epoch": 0.148704, + "grad_norm": 0.71875, + "learning_rate": 8.581774193548387e-05, + "loss": 0.1789, + "step": 9294 + }, + { + "epoch": 0.14872, + "grad_norm": 0.97265625, + "learning_rate": 8.581612903225807e-05, + "loss": 0.1746, + "step": 9295 + }, + { + "epoch": 0.148736, + "grad_norm": 0.60546875, + "learning_rate": 8.581451612903227e-05, + "loss": 0.1584, + "step": 9296 + }, + { + "epoch": 0.148752, + "grad_norm": 0.73046875, + "learning_rate": 8.581290322580646e-05, + "loss": 0.2083, + "step": 9297 + }, + { + "epoch": 0.148768, + "grad_norm": 0.609375, + "learning_rate": 8.581129032258066e-05, + "loss": 0.1449, + "step": 9298 + }, + { + "epoch": 0.148784, + "grad_norm": 0.75390625, + "learning_rate": 8.580967741935484e-05, + "loss": 0.2044, + "step": 9299 + }, + { + "epoch": 0.1488, + "grad_norm": 0.56640625, + "learning_rate": 8.580806451612904e-05, + "loss": 0.1985, + "step": 9300 + }, + { + "epoch": 0.148816, + "grad_norm": 1.484375, + "learning_rate": 8.580645161290323e-05, + "loss": 0.1821, + "step": 9301 + }, + { + "epoch": 0.148832, + "grad_norm": 0.71875, + "learning_rate": 8.580483870967743e-05, + "loss": 0.1357, + "step": 9302 + }, + { + "epoch": 0.148848, + "grad_norm": 0.75, + "learning_rate": 8.580322580645161e-05, + "loss": 0.1775, + "step": 9303 + }, + { + "epoch": 0.148864, + "grad_norm": 0.74609375, + "learning_rate": 8.580161290322581e-05, + "loss": 0.1783, + "step": 9304 + }, + { + "epoch": 0.14888, + "grad_norm": 1.3671875, + "learning_rate": 8.58e-05, + "loss": 0.1603, + "step": 9305 + }, + { + "epoch": 0.148896, + "grad_norm": 0.58203125, + "learning_rate": 8.57983870967742e-05, + "loss": 0.1593, + "step": 9306 + }, + { + "epoch": 0.148912, + "grad_norm": 0.76953125, + "learning_rate": 8.579677419354838e-05, + "loss": 0.217, + "step": 9307 + }, + { + "epoch": 0.148928, + "grad_norm": 1.0078125, + "learning_rate": 8.579516129032258e-05, + "loss": 0.1615, + "step": 9308 + }, + { + "epoch": 0.148944, + "grad_norm": 0.57421875, + "learning_rate": 8.579354838709678e-05, + "loss": 0.1526, + "step": 9309 + }, + { + "epoch": 0.14896, + "grad_norm": 0.73828125, + "learning_rate": 8.579193548387097e-05, + "loss": 0.1849, + "step": 9310 + }, + { + "epoch": 0.148976, + "grad_norm": 0.98828125, + "learning_rate": 8.579032258064517e-05, + "loss": 0.1426, + "step": 9311 + }, + { + "epoch": 0.148992, + "grad_norm": 0.6484375, + "learning_rate": 8.578870967741936e-05, + "loss": 0.165, + "step": 9312 + }, + { + "epoch": 0.149008, + "grad_norm": 0.71875, + "learning_rate": 8.578709677419355e-05, + "loss": 0.1934, + "step": 9313 + }, + { + "epoch": 0.149024, + "grad_norm": 0.640625, + "learning_rate": 8.578548387096774e-05, + "loss": 0.172, + "step": 9314 + }, + { + "epoch": 0.14904, + "grad_norm": 0.6953125, + "learning_rate": 8.578387096774194e-05, + "loss": 0.1915, + "step": 9315 + }, + { + "epoch": 0.149056, + "grad_norm": 0.81640625, + "learning_rate": 8.578225806451613e-05, + "loss": 0.1823, + "step": 9316 + }, + { + "epoch": 0.149072, + "grad_norm": 0.98828125, + "learning_rate": 8.578064516129033e-05, + "loss": 0.2217, + "step": 9317 + }, + { + "epoch": 0.149088, + "grad_norm": 0.69921875, + "learning_rate": 8.577903225806451e-05, + "loss": 0.191, + "step": 9318 + }, + { + "epoch": 0.149104, + "grad_norm": 1.0234375, + "learning_rate": 8.577741935483871e-05, + "loss": 0.1775, + "step": 9319 + }, + { + "epoch": 0.14912, + "grad_norm": 0.6171875, + "learning_rate": 8.577580645161291e-05, + "loss": 0.1297, + "step": 9320 + }, + { + "epoch": 0.149136, + "grad_norm": 0.64453125, + "learning_rate": 8.577419354838711e-05, + "loss": 0.1536, + "step": 9321 + }, + { + "epoch": 0.149152, + "grad_norm": 0.66796875, + "learning_rate": 8.57725806451613e-05, + "loss": 0.1684, + "step": 9322 + }, + { + "epoch": 0.149168, + "grad_norm": 1.484375, + "learning_rate": 8.57709677419355e-05, + "loss": 0.1786, + "step": 9323 + }, + { + "epoch": 0.149184, + "grad_norm": 0.83203125, + "learning_rate": 8.576935483870968e-05, + "loss": 0.189, + "step": 9324 + }, + { + "epoch": 0.1492, + "grad_norm": 0.73828125, + "learning_rate": 8.576774193548387e-05, + "loss": 0.1463, + "step": 9325 + }, + { + "epoch": 0.149216, + "grad_norm": 0.82421875, + "learning_rate": 8.576612903225807e-05, + "loss": 0.186, + "step": 9326 + }, + { + "epoch": 0.149232, + "grad_norm": 0.7421875, + "learning_rate": 8.576451612903225e-05, + "loss": 0.1734, + "step": 9327 + }, + { + "epoch": 0.149248, + "grad_norm": 0.8203125, + "learning_rate": 8.576290322580645e-05, + "loss": 0.172, + "step": 9328 + }, + { + "epoch": 0.149264, + "grad_norm": 1.0859375, + "learning_rate": 8.576129032258064e-05, + "loss": 0.207, + "step": 9329 + }, + { + "epoch": 0.14928, + "grad_norm": 0.6328125, + "learning_rate": 8.575967741935484e-05, + "loss": 0.17, + "step": 9330 + }, + { + "epoch": 0.149296, + "grad_norm": 0.625, + "learning_rate": 8.575806451612904e-05, + "loss": 0.1743, + "step": 9331 + }, + { + "epoch": 0.149312, + "grad_norm": 0.63671875, + "learning_rate": 8.575645161290323e-05, + "loss": 0.1624, + "step": 9332 + }, + { + "epoch": 0.149328, + "grad_norm": 1.1015625, + "learning_rate": 8.575483870967743e-05, + "loss": 0.156, + "step": 9333 + }, + { + "epoch": 0.149344, + "grad_norm": 0.796875, + "learning_rate": 8.575322580645163e-05, + "loss": 0.1639, + "step": 9334 + }, + { + "epoch": 0.14936, + "grad_norm": 0.62109375, + "learning_rate": 8.575161290322581e-05, + "loss": 0.1681, + "step": 9335 + }, + { + "epoch": 0.149376, + "grad_norm": 1.0625, + "learning_rate": 8.575000000000001e-05, + "loss": 0.194, + "step": 9336 + }, + { + "epoch": 0.149392, + "grad_norm": 0.72265625, + "learning_rate": 8.57483870967742e-05, + "loss": 0.1976, + "step": 9337 + }, + { + "epoch": 0.149408, + "grad_norm": 0.9609375, + "learning_rate": 8.57467741935484e-05, + "loss": 0.186, + "step": 9338 + }, + { + "epoch": 0.149424, + "grad_norm": 0.66015625, + "learning_rate": 8.574516129032258e-05, + "loss": 0.1701, + "step": 9339 + }, + { + "epoch": 0.14944, + "grad_norm": 0.9296875, + "learning_rate": 8.574354838709678e-05, + "loss": 0.1763, + "step": 9340 + }, + { + "epoch": 0.149456, + "grad_norm": 0.7578125, + "learning_rate": 8.574193548387097e-05, + "loss": 0.1681, + "step": 9341 + }, + { + "epoch": 0.149472, + "grad_norm": 1.2265625, + "learning_rate": 8.574032258064515e-05, + "loss": 0.1704, + "step": 9342 + }, + { + "epoch": 0.149488, + "grad_norm": 0.98046875, + "learning_rate": 8.573870967741935e-05, + "loss": 0.2408, + "step": 9343 + }, + { + "epoch": 0.149504, + "grad_norm": 0.66015625, + "learning_rate": 8.573709677419355e-05, + "loss": 0.1683, + "step": 9344 + }, + { + "epoch": 0.14952, + "grad_norm": 0.828125, + "learning_rate": 8.573548387096775e-05, + "loss": 0.1707, + "step": 9345 + }, + { + "epoch": 0.149536, + "grad_norm": 1.0078125, + "learning_rate": 8.573387096774194e-05, + "loss": 0.1517, + "step": 9346 + }, + { + "epoch": 0.149552, + "grad_norm": 0.8046875, + "learning_rate": 8.573225806451614e-05, + "loss": 0.2082, + "step": 9347 + }, + { + "epoch": 0.149568, + "grad_norm": 0.5859375, + "learning_rate": 8.573064516129033e-05, + "loss": 0.1498, + "step": 9348 + }, + { + "epoch": 0.149584, + "grad_norm": 0.91015625, + "learning_rate": 8.572903225806452e-05, + "loss": 0.196, + "step": 9349 + }, + { + "epoch": 0.1496, + "grad_norm": 0.8203125, + "learning_rate": 8.572741935483871e-05, + "loss": 0.1956, + "step": 9350 + }, + { + "epoch": 0.149616, + "grad_norm": 1.2265625, + "learning_rate": 8.572580645161291e-05, + "loss": 0.1793, + "step": 9351 + }, + { + "epoch": 0.149632, + "grad_norm": 0.71875, + "learning_rate": 8.57241935483871e-05, + "loss": 0.1353, + "step": 9352 + }, + { + "epoch": 0.149648, + "grad_norm": 0.9609375, + "learning_rate": 8.57225806451613e-05, + "loss": 0.1653, + "step": 9353 + }, + { + "epoch": 0.149664, + "grad_norm": 0.81640625, + "learning_rate": 8.572096774193548e-05, + "loss": 0.1943, + "step": 9354 + }, + { + "epoch": 0.14968, + "grad_norm": 0.78515625, + "learning_rate": 8.571935483870968e-05, + "loss": 0.1833, + "step": 9355 + }, + { + "epoch": 0.149696, + "grad_norm": 0.58203125, + "learning_rate": 8.571774193548388e-05, + "loss": 0.1523, + "step": 9356 + }, + { + "epoch": 0.149712, + "grad_norm": 0.8515625, + "learning_rate": 8.571612903225807e-05, + "loss": 0.1827, + "step": 9357 + }, + { + "epoch": 0.149728, + "grad_norm": 0.66015625, + "learning_rate": 8.571451612903227e-05, + "loss": 0.1739, + "step": 9358 + }, + { + "epoch": 0.149744, + "grad_norm": 0.6875, + "learning_rate": 8.571290322580645e-05, + "loss": 0.1301, + "step": 9359 + }, + { + "epoch": 0.14976, + "grad_norm": 0.66015625, + "learning_rate": 8.571129032258065e-05, + "loss": 0.158, + "step": 9360 + }, + { + "epoch": 0.149776, + "grad_norm": 1.1171875, + "learning_rate": 8.570967741935484e-05, + "loss": 0.2253, + "step": 9361 + }, + { + "epoch": 0.149792, + "grad_norm": 0.82421875, + "learning_rate": 8.570806451612904e-05, + "loss": 0.1911, + "step": 9362 + }, + { + "epoch": 0.149808, + "grad_norm": 1.21875, + "learning_rate": 8.570645161290322e-05, + "loss": 0.1616, + "step": 9363 + }, + { + "epoch": 0.149824, + "grad_norm": 0.609375, + "learning_rate": 8.570483870967742e-05, + "loss": 0.1744, + "step": 9364 + }, + { + "epoch": 0.14984, + "grad_norm": 0.84375, + "learning_rate": 8.570322580645161e-05, + "loss": 0.2221, + "step": 9365 + }, + { + "epoch": 0.149856, + "grad_norm": 0.66796875, + "learning_rate": 8.570161290322581e-05, + "loss": 0.1657, + "step": 9366 + }, + { + "epoch": 0.149872, + "grad_norm": 1.0859375, + "learning_rate": 8.57e-05, + "loss": 0.1619, + "step": 9367 + }, + { + "epoch": 0.149888, + "grad_norm": 0.640625, + "learning_rate": 8.56983870967742e-05, + "loss": 0.1575, + "step": 9368 + }, + { + "epoch": 0.149904, + "grad_norm": 1.078125, + "learning_rate": 8.56967741935484e-05, + "loss": 0.1861, + "step": 9369 + }, + { + "epoch": 0.14992, + "grad_norm": 0.7265625, + "learning_rate": 8.56951612903226e-05, + "loss": 0.1803, + "step": 9370 + }, + { + "epoch": 0.149936, + "grad_norm": 0.75, + "learning_rate": 8.569354838709678e-05, + "loss": 0.1813, + "step": 9371 + }, + { + "epoch": 0.149952, + "grad_norm": 0.7109375, + "learning_rate": 8.569193548387097e-05, + "loss": 0.174, + "step": 9372 + }, + { + "epoch": 0.149968, + "grad_norm": 1.2421875, + "learning_rate": 8.569032258064517e-05, + "loss": 0.1911, + "step": 9373 + }, + { + "epoch": 0.149984, + "grad_norm": 1.21875, + "learning_rate": 8.568870967741935e-05, + "loss": 0.163, + "step": 9374 + }, + { + "epoch": 0.15, + "grad_norm": 0.78125, + "learning_rate": 8.568709677419355e-05, + "loss": 0.1633, + "step": 9375 + }, + { + "epoch": 0.150016, + "grad_norm": 1.3515625, + "learning_rate": 8.568548387096774e-05, + "loss": 0.1727, + "step": 9376 + }, + { + "epoch": 0.150032, + "grad_norm": 0.6640625, + "learning_rate": 8.568387096774194e-05, + "loss": 0.1606, + "step": 9377 + }, + { + "epoch": 0.150048, + "grad_norm": 0.73828125, + "learning_rate": 8.568225806451612e-05, + "loss": 0.1532, + "step": 9378 + }, + { + "epoch": 0.150064, + "grad_norm": 0.7265625, + "learning_rate": 8.568064516129032e-05, + "loss": 0.2024, + "step": 9379 + }, + { + "epoch": 0.15008, + "grad_norm": 0.875, + "learning_rate": 8.567903225806452e-05, + "loss": 0.1761, + "step": 9380 + }, + { + "epoch": 0.150096, + "grad_norm": 0.58203125, + "learning_rate": 8.567741935483872e-05, + "loss": 0.1766, + "step": 9381 + }, + { + "epoch": 0.150112, + "grad_norm": 1.1484375, + "learning_rate": 8.567580645161291e-05, + "loss": 0.173, + "step": 9382 + }, + { + "epoch": 0.150128, + "grad_norm": 1.1328125, + "learning_rate": 8.567419354838711e-05, + "loss": 0.174, + "step": 9383 + }, + { + "epoch": 0.150144, + "grad_norm": 0.7265625, + "learning_rate": 8.56725806451613e-05, + "loss": 0.1764, + "step": 9384 + }, + { + "epoch": 0.15016, + "grad_norm": 0.66015625, + "learning_rate": 8.56709677419355e-05, + "loss": 0.1629, + "step": 9385 + }, + { + "epoch": 0.150176, + "grad_norm": 0.82421875, + "learning_rate": 8.566935483870968e-05, + "loss": 0.2408, + "step": 9386 + }, + { + "epoch": 0.150192, + "grad_norm": 1.2734375, + "learning_rate": 8.566774193548388e-05, + "loss": 0.1644, + "step": 9387 + }, + { + "epoch": 0.150208, + "grad_norm": 0.64453125, + "learning_rate": 8.566612903225807e-05, + "loss": 0.1444, + "step": 9388 + }, + { + "epoch": 0.150224, + "grad_norm": 0.76171875, + "learning_rate": 8.566451612903225e-05, + "loss": 0.2106, + "step": 9389 + }, + { + "epoch": 0.15024, + "grad_norm": 0.546875, + "learning_rate": 8.566290322580645e-05, + "loss": 0.1757, + "step": 9390 + }, + { + "epoch": 0.150256, + "grad_norm": 0.7421875, + "learning_rate": 8.566129032258065e-05, + "loss": 0.1385, + "step": 9391 + }, + { + "epoch": 0.150272, + "grad_norm": 0.796875, + "learning_rate": 8.565967741935485e-05, + "loss": 0.1712, + "step": 9392 + }, + { + "epoch": 0.150288, + "grad_norm": 1.234375, + "learning_rate": 8.565806451612904e-05, + "loss": 0.2336, + "step": 9393 + }, + { + "epoch": 0.150304, + "grad_norm": 0.578125, + "learning_rate": 8.565645161290324e-05, + "loss": 0.1727, + "step": 9394 + }, + { + "epoch": 0.15032, + "grad_norm": 1.1640625, + "learning_rate": 8.565483870967742e-05, + "loss": 0.1614, + "step": 9395 + }, + { + "epoch": 0.150336, + "grad_norm": 0.609375, + "learning_rate": 8.565322580645162e-05, + "loss": 0.1353, + "step": 9396 + }, + { + "epoch": 0.150352, + "grad_norm": 0.828125, + "learning_rate": 8.565161290322581e-05, + "loss": 0.1606, + "step": 9397 + }, + { + "epoch": 0.150368, + "grad_norm": 1.1484375, + "learning_rate": 8.565000000000001e-05, + "loss": 0.1371, + "step": 9398 + }, + { + "epoch": 0.150384, + "grad_norm": 1.1640625, + "learning_rate": 8.56483870967742e-05, + "loss": 0.1696, + "step": 9399 + }, + { + "epoch": 0.1504, + "grad_norm": 0.921875, + "learning_rate": 8.56467741935484e-05, + "loss": 0.1822, + "step": 9400 + }, + { + "epoch": 0.150416, + "grad_norm": 0.703125, + "learning_rate": 8.564516129032258e-05, + "loss": 0.1714, + "step": 9401 + }, + { + "epoch": 0.150432, + "grad_norm": 0.98828125, + "learning_rate": 8.564354838709678e-05, + "loss": 0.1716, + "step": 9402 + }, + { + "epoch": 0.150448, + "grad_norm": 0.77734375, + "learning_rate": 8.564193548387097e-05, + "loss": 0.1891, + "step": 9403 + }, + { + "epoch": 0.150464, + "grad_norm": 0.94140625, + "learning_rate": 8.564032258064517e-05, + "loss": 0.2035, + "step": 9404 + }, + { + "epoch": 0.15048, + "grad_norm": 0.6484375, + "learning_rate": 8.563870967741937e-05, + "loss": 0.1404, + "step": 9405 + }, + { + "epoch": 0.150496, + "grad_norm": 1.6328125, + "learning_rate": 8.563709677419355e-05, + "loss": 0.1815, + "step": 9406 + }, + { + "epoch": 0.150512, + "grad_norm": 0.73828125, + "learning_rate": 8.563548387096775e-05, + "loss": 0.184, + "step": 9407 + }, + { + "epoch": 0.150528, + "grad_norm": 0.8515625, + "learning_rate": 8.563387096774194e-05, + "loss": 0.216, + "step": 9408 + }, + { + "epoch": 0.150544, + "grad_norm": 1.390625, + "learning_rate": 8.563225806451614e-05, + "loss": 0.1945, + "step": 9409 + }, + { + "epoch": 0.15056, + "grad_norm": 0.640625, + "learning_rate": 8.563064516129032e-05, + "loss": 0.1806, + "step": 9410 + }, + { + "epoch": 0.150576, + "grad_norm": 0.72265625, + "learning_rate": 8.562903225806452e-05, + "loss": 0.1391, + "step": 9411 + }, + { + "epoch": 0.150592, + "grad_norm": 0.796875, + "learning_rate": 8.562741935483871e-05, + "loss": 0.1709, + "step": 9412 + }, + { + "epoch": 0.150608, + "grad_norm": 1.0, + "learning_rate": 8.562580645161291e-05, + "loss": 0.2112, + "step": 9413 + }, + { + "epoch": 0.150624, + "grad_norm": 1.0546875, + "learning_rate": 8.56241935483871e-05, + "loss": 0.2165, + "step": 9414 + }, + { + "epoch": 0.15064, + "grad_norm": 0.71484375, + "learning_rate": 8.56225806451613e-05, + "loss": 0.2233, + "step": 9415 + }, + { + "epoch": 0.150656, + "grad_norm": 0.97265625, + "learning_rate": 8.56209677419355e-05, + "loss": 0.1594, + "step": 9416 + }, + { + "epoch": 0.150672, + "grad_norm": 0.8125, + "learning_rate": 8.561935483870969e-05, + "loss": 0.1732, + "step": 9417 + }, + { + "epoch": 0.150688, + "grad_norm": 0.703125, + "learning_rate": 8.561774193548388e-05, + "loss": 0.1991, + "step": 9418 + }, + { + "epoch": 0.150704, + "grad_norm": 1.6171875, + "learning_rate": 8.561612903225807e-05, + "loss": 0.1762, + "step": 9419 + }, + { + "epoch": 0.15072, + "grad_norm": 0.953125, + "learning_rate": 8.561451612903226e-05, + "loss": 0.1748, + "step": 9420 + }, + { + "epoch": 0.150736, + "grad_norm": 0.82421875, + "learning_rate": 8.561290322580645e-05, + "loss": 0.1278, + "step": 9421 + }, + { + "epoch": 0.150752, + "grad_norm": 0.77734375, + "learning_rate": 8.561129032258065e-05, + "loss": 0.1779, + "step": 9422 + }, + { + "epoch": 0.150768, + "grad_norm": 0.7578125, + "learning_rate": 8.560967741935484e-05, + "loss": 0.1817, + "step": 9423 + }, + { + "epoch": 0.150784, + "grad_norm": 1.4140625, + "learning_rate": 8.560806451612904e-05, + "loss": 0.2275, + "step": 9424 + }, + { + "epoch": 0.1508, + "grad_norm": 1.03125, + "learning_rate": 8.560645161290322e-05, + "loss": 0.1815, + "step": 9425 + }, + { + "epoch": 0.150816, + "grad_norm": 0.7578125, + "learning_rate": 8.560483870967742e-05, + "loss": 0.1904, + "step": 9426 + }, + { + "epoch": 0.150832, + "grad_norm": 0.6640625, + "learning_rate": 8.560322580645161e-05, + "loss": 0.173, + "step": 9427 + }, + { + "epoch": 0.150848, + "grad_norm": 1.0078125, + "learning_rate": 8.560161290322581e-05, + "loss": 0.2064, + "step": 9428 + }, + { + "epoch": 0.150864, + "grad_norm": 1.4140625, + "learning_rate": 8.560000000000001e-05, + "loss": 0.1987, + "step": 9429 + }, + { + "epoch": 0.15088, + "grad_norm": 1.1640625, + "learning_rate": 8.55983870967742e-05, + "loss": 0.1796, + "step": 9430 + }, + { + "epoch": 0.150896, + "grad_norm": 0.984375, + "learning_rate": 8.559677419354839e-05, + "loss": 0.1833, + "step": 9431 + }, + { + "epoch": 0.150912, + "grad_norm": 0.75390625, + "learning_rate": 8.559516129032259e-05, + "loss": 0.1677, + "step": 9432 + }, + { + "epoch": 0.150928, + "grad_norm": 0.984375, + "learning_rate": 8.559354838709678e-05, + "loss": 0.2181, + "step": 9433 + }, + { + "epoch": 0.150944, + "grad_norm": 0.7265625, + "learning_rate": 8.559193548387096e-05, + "loss": 0.1937, + "step": 9434 + }, + { + "epoch": 0.15096, + "grad_norm": 0.66015625, + "learning_rate": 8.559032258064516e-05, + "loss": 0.1727, + "step": 9435 + }, + { + "epoch": 0.150976, + "grad_norm": 1.125, + "learning_rate": 8.558870967741935e-05, + "loss": 0.2172, + "step": 9436 + }, + { + "epoch": 0.150992, + "grad_norm": 0.8125, + "learning_rate": 8.558709677419355e-05, + "loss": 0.1818, + "step": 9437 + }, + { + "epoch": 0.151008, + "grad_norm": 0.77734375, + "learning_rate": 8.558548387096774e-05, + "loss": 0.1732, + "step": 9438 + }, + { + "epoch": 0.151024, + "grad_norm": 0.84375, + "learning_rate": 8.558387096774194e-05, + "loss": 0.1891, + "step": 9439 + }, + { + "epoch": 0.15104, + "grad_norm": 0.66015625, + "learning_rate": 8.558225806451614e-05, + "loss": 0.1982, + "step": 9440 + }, + { + "epoch": 0.151056, + "grad_norm": 0.859375, + "learning_rate": 8.558064516129033e-05, + "loss": 0.1835, + "step": 9441 + }, + { + "epoch": 0.151072, + "grad_norm": 0.859375, + "learning_rate": 8.557903225806452e-05, + "loss": 0.1646, + "step": 9442 + }, + { + "epoch": 0.151088, + "grad_norm": 1.0234375, + "learning_rate": 8.557741935483872e-05, + "loss": 0.1628, + "step": 9443 + }, + { + "epoch": 0.151104, + "grad_norm": 0.80078125, + "learning_rate": 8.55758064516129e-05, + "loss": 0.1696, + "step": 9444 + }, + { + "epoch": 0.15112, + "grad_norm": 0.72265625, + "learning_rate": 8.55741935483871e-05, + "loss": 0.1302, + "step": 9445 + }, + { + "epoch": 0.151136, + "grad_norm": 0.546875, + "learning_rate": 8.557258064516129e-05, + "loss": 0.1411, + "step": 9446 + }, + { + "epoch": 0.151152, + "grad_norm": 1.03125, + "learning_rate": 8.557096774193549e-05, + "loss": 0.1632, + "step": 9447 + }, + { + "epoch": 0.151168, + "grad_norm": 0.75, + "learning_rate": 8.556935483870968e-05, + "loss": 0.1802, + "step": 9448 + }, + { + "epoch": 0.151184, + "grad_norm": 0.8125, + "learning_rate": 8.556774193548388e-05, + "loss": 0.1865, + "step": 9449 + }, + { + "epoch": 0.1512, + "grad_norm": 0.58203125, + "learning_rate": 8.556612903225806e-05, + "loss": 0.1346, + "step": 9450 + }, + { + "epoch": 0.151216, + "grad_norm": 0.5234375, + "learning_rate": 8.556451612903226e-05, + "loss": 0.1715, + "step": 9451 + }, + { + "epoch": 0.151232, + "grad_norm": 0.765625, + "learning_rate": 8.556290322580646e-05, + "loss": 0.1757, + "step": 9452 + }, + { + "epoch": 0.151248, + "grad_norm": 0.72265625, + "learning_rate": 8.556129032258065e-05, + "loss": 0.186, + "step": 9453 + }, + { + "epoch": 0.151264, + "grad_norm": 0.70703125, + "learning_rate": 8.555967741935485e-05, + "loss": 0.1396, + "step": 9454 + }, + { + "epoch": 0.15128, + "grad_norm": 0.86328125, + "learning_rate": 8.555806451612903e-05, + "loss": 0.2025, + "step": 9455 + }, + { + "epoch": 0.151296, + "grad_norm": 0.7265625, + "learning_rate": 8.555645161290323e-05, + "loss": 0.1561, + "step": 9456 + }, + { + "epoch": 0.151312, + "grad_norm": 0.83984375, + "learning_rate": 8.555483870967742e-05, + "loss": 0.1697, + "step": 9457 + }, + { + "epoch": 0.151328, + "grad_norm": 0.80859375, + "learning_rate": 8.555322580645162e-05, + "loss": 0.1809, + "step": 9458 + }, + { + "epoch": 0.151344, + "grad_norm": 0.73828125, + "learning_rate": 8.55516129032258e-05, + "loss": 0.1514, + "step": 9459 + }, + { + "epoch": 0.15136, + "grad_norm": 0.8671875, + "learning_rate": 8.555e-05, + "loss": 0.1665, + "step": 9460 + }, + { + "epoch": 0.151376, + "grad_norm": 1.3125, + "learning_rate": 8.554838709677419e-05, + "loss": 0.2005, + "step": 9461 + }, + { + "epoch": 0.151392, + "grad_norm": 1.125, + "learning_rate": 8.554677419354839e-05, + "loss": 0.1957, + "step": 9462 + }, + { + "epoch": 0.151408, + "grad_norm": 0.73046875, + "learning_rate": 8.554516129032258e-05, + "loss": 0.1795, + "step": 9463 + }, + { + "epoch": 0.151424, + "grad_norm": 0.8125, + "learning_rate": 8.554354838709678e-05, + "loss": 0.1843, + "step": 9464 + }, + { + "epoch": 0.15144, + "grad_norm": 0.71875, + "learning_rate": 8.554193548387098e-05, + "loss": 0.2006, + "step": 9465 + }, + { + "epoch": 0.151456, + "grad_norm": 0.9453125, + "learning_rate": 8.554032258064516e-05, + "loss": 0.225, + "step": 9466 + }, + { + "epoch": 0.151472, + "grad_norm": 0.73828125, + "learning_rate": 8.553870967741936e-05, + "loss": 0.2182, + "step": 9467 + }, + { + "epoch": 0.151488, + "grad_norm": 0.431640625, + "learning_rate": 8.553709677419355e-05, + "loss": 0.1492, + "step": 9468 + }, + { + "epoch": 0.151504, + "grad_norm": 0.6640625, + "learning_rate": 8.553548387096775e-05, + "loss": 0.1472, + "step": 9469 + }, + { + "epoch": 0.15152, + "grad_norm": 1.078125, + "learning_rate": 8.553387096774193e-05, + "loss": 0.1565, + "step": 9470 + }, + { + "epoch": 0.151536, + "grad_norm": 0.490234375, + "learning_rate": 8.553225806451613e-05, + "loss": 0.134, + "step": 9471 + }, + { + "epoch": 0.151552, + "grad_norm": 0.6953125, + "learning_rate": 8.553064516129032e-05, + "loss": 0.1893, + "step": 9472 + }, + { + "epoch": 0.151568, + "grad_norm": 0.953125, + "learning_rate": 8.552903225806452e-05, + "loss": 0.1809, + "step": 9473 + }, + { + "epoch": 0.151584, + "grad_norm": 0.64453125, + "learning_rate": 8.55274193548387e-05, + "loss": 0.2087, + "step": 9474 + }, + { + "epoch": 0.1516, + "grad_norm": 0.79296875, + "learning_rate": 8.55258064516129e-05, + "loss": 0.171, + "step": 9475 + }, + { + "epoch": 0.151616, + "grad_norm": 0.9453125, + "learning_rate": 8.55241935483871e-05, + "loss": 0.1637, + "step": 9476 + }, + { + "epoch": 0.151632, + "grad_norm": 0.6953125, + "learning_rate": 8.55225806451613e-05, + "loss": 0.1465, + "step": 9477 + }, + { + "epoch": 0.151648, + "grad_norm": 1.1171875, + "learning_rate": 8.552096774193549e-05, + "loss": 0.2088, + "step": 9478 + }, + { + "epoch": 0.151664, + "grad_norm": 0.65625, + "learning_rate": 8.551935483870969e-05, + "loss": 0.1697, + "step": 9479 + }, + { + "epoch": 0.15168, + "grad_norm": 1.015625, + "learning_rate": 8.551774193548388e-05, + "loss": 0.1842, + "step": 9480 + }, + { + "epoch": 0.151696, + "grad_norm": 0.4453125, + "learning_rate": 8.551612903225806e-05, + "loss": 0.108, + "step": 9481 + }, + { + "epoch": 0.151712, + "grad_norm": 1.15625, + "learning_rate": 8.551451612903226e-05, + "loss": 0.1502, + "step": 9482 + }, + { + "epoch": 0.151728, + "grad_norm": 0.6484375, + "learning_rate": 8.551290322580645e-05, + "loss": 0.1577, + "step": 9483 + }, + { + "epoch": 0.151744, + "grad_norm": 0.76171875, + "learning_rate": 8.551129032258065e-05, + "loss": 0.1507, + "step": 9484 + }, + { + "epoch": 0.15176, + "grad_norm": 0.7890625, + "learning_rate": 8.550967741935483e-05, + "loss": 0.1572, + "step": 9485 + }, + { + "epoch": 0.151776, + "grad_norm": 0.80078125, + "learning_rate": 8.550806451612903e-05, + "loss": 0.1637, + "step": 9486 + }, + { + "epoch": 0.151792, + "grad_norm": 0.49609375, + "learning_rate": 8.550645161290323e-05, + "loss": 0.1634, + "step": 9487 + }, + { + "epoch": 0.151808, + "grad_norm": 0.68359375, + "learning_rate": 8.550483870967743e-05, + "loss": 0.1729, + "step": 9488 + }, + { + "epoch": 0.151824, + "grad_norm": 0.79296875, + "learning_rate": 8.550322580645162e-05, + "loss": 0.1672, + "step": 9489 + }, + { + "epoch": 0.15184, + "grad_norm": 0.7734375, + "learning_rate": 8.550161290322582e-05, + "loss": 0.1696, + "step": 9490 + }, + { + "epoch": 0.151856, + "grad_norm": 0.63671875, + "learning_rate": 8.55e-05, + "loss": 0.1748, + "step": 9491 + }, + { + "epoch": 0.151872, + "grad_norm": 0.75390625, + "learning_rate": 8.54983870967742e-05, + "loss": 0.1717, + "step": 9492 + }, + { + "epoch": 0.151888, + "grad_norm": 0.7265625, + "learning_rate": 8.549677419354839e-05, + "loss": 0.1851, + "step": 9493 + }, + { + "epoch": 0.151904, + "grad_norm": 0.8125, + "learning_rate": 8.549516129032259e-05, + "loss": 0.1774, + "step": 9494 + }, + { + "epoch": 0.15192, + "grad_norm": 0.57421875, + "learning_rate": 8.549354838709678e-05, + "loss": 0.1457, + "step": 9495 + }, + { + "epoch": 0.151936, + "grad_norm": 0.76171875, + "learning_rate": 8.549193548387096e-05, + "loss": 0.162, + "step": 9496 + }, + { + "epoch": 0.151952, + "grad_norm": 1.078125, + "learning_rate": 8.549032258064516e-05, + "loss": 0.2258, + "step": 9497 + }, + { + "epoch": 0.151968, + "grad_norm": 0.68359375, + "learning_rate": 8.548870967741935e-05, + "loss": 0.1453, + "step": 9498 + }, + { + "epoch": 0.151984, + "grad_norm": 0.890625, + "learning_rate": 8.548709677419355e-05, + "loss": 0.1585, + "step": 9499 + }, + { + "epoch": 0.152, + "grad_norm": 0.89453125, + "learning_rate": 8.548548387096775e-05, + "loss": 0.1796, + "step": 9500 + }, + { + "epoch": 0.152016, + "grad_norm": 0.80078125, + "learning_rate": 8.548387096774195e-05, + "loss": 0.1722, + "step": 9501 + }, + { + "epoch": 0.152032, + "grad_norm": 1.03125, + "learning_rate": 8.548225806451613e-05, + "loss": 0.2111, + "step": 9502 + }, + { + "epoch": 0.152048, + "grad_norm": 0.84375, + "learning_rate": 8.548064516129033e-05, + "loss": 0.1554, + "step": 9503 + }, + { + "epoch": 0.152064, + "grad_norm": 0.80078125, + "learning_rate": 8.547903225806452e-05, + "loss": 0.1795, + "step": 9504 + }, + { + "epoch": 0.15208, + "grad_norm": 0.78515625, + "learning_rate": 8.547741935483872e-05, + "loss": 0.1764, + "step": 9505 + }, + { + "epoch": 0.152096, + "grad_norm": 0.78515625, + "learning_rate": 8.54758064516129e-05, + "loss": 0.1715, + "step": 9506 + }, + { + "epoch": 0.152112, + "grad_norm": 0.88671875, + "learning_rate": 8.54741935483871e-05, + "loss": 0.1692, + "step": 9507 + }, + { + "epoch": 0.152128, + "grad_norm": 0.828125, + "learning_rate": 8.547258064516129e-05, + "loss": 0.1264, + "step": 9508 + }, + { + "epoch": 0.152144, + "grad_norm": 1.0625, + "learning_rate": 8.547096774193549e-05, + "loss": 0.217, + "step": 9509 + }, + { + "epoch": 0.15216, + "grad_norm": 0.6953125, + "learning_rate": 8.546935483870968e-05, + "loss": 0.1596, + "step": 9510 + }, + { + "epoch": 0.152176, + "grad_norm": 1.1171875, + "learning_rate": 8.546774193548388e-05, + "loss": 0.1884, + "step": 9511 + }, + { + "epoch": 0.152192, + "grad_norm": 0.87109375, + "learning_rate": 8.546612903225807e-05, + "loss": 0.1585, + "step": 9512 + }, + { + "epoch": 0.152208, + "grad_norm": 0.81640625, + "learning_rate": 8.546451612903226e-05, + "loss": 0.1908, + "step": 9513 + }, + { + "epoch": 0.152224, + "grad_norm": 0.578125, + "learning_rate": 8.546290322580646e-05, + "loss": 0.2143, + "step": 9514 + }, + { + "epoch": 0.15224, + "grad_norm": 0.6328125, + "learning_rate": 8.546129032258065e-05, + "loss": 0.1535, + "step": 9515 + }, + { + "epoch": 0.152256, + "grad_norm": 0.7890625, + "learning_rate": 8.545967741935485e-05, + "loss": 0.1883, + "step": 9516 + }, + { + "epoch": 0.152272, + "grad_norm": 0.75, + "learning_rate": 8.545806451612903e-05, + "loss": 0.1787, + "step": 9517 + }, + { + "epoch": 0.152288, + "grad_norm": 0.89453125, + "learning_rate": 8.545645161290323e-05, + "loss": 0.1479, + "step": 9518 + }, + { + "epoch": 0.152304, + "grad_norm": 0.8984375, + "learning_rate": 8.545483870967742e-05, + "loss": 0.1699, + "step": 9519 + }, + { + "epoch": 0.15232, + "grad_norm": 0.7265625, + "learning_rate": 8.545322580645162e-05, + "loss": 0.1988, + "step": 9520 + }, + { + "epoch": 0.152336, + "grad_norm": 0.72265625, + "learning_rate": 8.54516129032258e-05, + "loss": 0.1872, + "step": 9521 + }, + { + "epoch": 0.152352, + "grad_norm": 0.76171875, + "learning_rate": 8.545e-05, + "loss": 0.1729, + "step": 9522 + }, + { + "epoch": 0.152368, + "grad_norm": 1.3515625, + "learning_rate": 8.544838709677419e-05, + "loss": 0.1864, + "step": 9523 + }, + { + "epoch": 0.152384, + "grad_norm": 0.75, + "learning_rate": 8.544677419354839e-05, + "loss": 0.1382, + "step": 9524 + }, + { + "epoch": 0.1524, + "grad_norm": 0.57421875, + "learning_rate": 8.544516129032259e-05, + "loss": 0.147, + "step": 9525 + }, + { + "epoch": 0.152416, + "grad_norm": 1.1875, + "learning_rate": 8.544354838709679e-05, + "loss": 0.2038, + "step": 9526 + }, + { + "epoch": 0.152432, + "grad_norm": 0.703125, + "learning_rate": 8.544193548387097e-05, + "loss": 0.1375, + "step": 9527 + }, + { + "epoch": 0.152448, + "grad_norm": 0.93359375, + "learning_rate": 8.544032258064516e-05, + "loss": 0.1635, + "step": 9528 + }, + { + "epoch": 0.152464, + "grad_norm": 1.1875, + "learning_rate": 8.543870967741936e-05, + "loss": 0.1805, + "step": 9529 + }, + { + "epoch": 0.15248, + "grad_norm": 0.953125, + "learning_rate": 8.543709677419355e-05, + "loss": 0.208, + "step": 9530 + }, + { + "epoch": 0.152496, + "grad_norm": 0.890625, + "learning_rate": 8.543548387096775e-05, + "loss": 0.1738, + "step": 9531 + }, + { + "epoch": 0.152512, + "grad_norm": 0.8125, + "learning_rate": 8.543387096774193e-05, + "loss": 0.1956, + "step": 9532 + }, + { + "epoch": 0.152528, + "grad_norm": 0.8828125, + "learning_rate": 8.543225806451613e-05, + "loss": 0.1413, + "step": 9533 + }, + { + "epoch": 0.152544, + "grad_norm": 0.9296875, + "learning_rate": 8.543064516129032e-05, + "loss": 0.1717, + "step": 9534 + }, + { + "epoch": 0.15256, + "grad_norm": 0.9453125, + "learning_rate": 8.542903225806452e-05, + "loss": 0.2188, + "step": 9535 + }, + { + "epoch": 0.152576, + "grad_norm": 0.84375, + "learning_rate": 8.542741935483872e-05, + "loss": 0.1855, + "step": 9536 + }, + { + "epoch": 0.152592, + "grad_norm": 0.87109375, + "learning_rate": 8.542580645161292e-05, + "loss": 0.2004, + "step": 9537 + }, + { + "epoch": 0.152608, + "grad_norm": 0.54296875, + "learning_rate": 8.54241935483871e-05, + "loss": 0.1265, + "step": 9538 + }, + { + "epoch": 0.152624, + "grad_norm": 0.73046875, + "learning_rate": 8.54225806451613e-05, + "loss": 0.1546, + "step": 9539 + }, + { + "epoch": 0.15264, + "grad_norm": 0.75, + "learning_rate": 8.542096774193549e-05, + "loss": 0.1522, + "step": 9540 + }, + { + "epoch": 0.152656, + "grad_norm": 0.84765625, + "learning_rate": 8.541935483870969e-05, + "loss": 0.1705, + "step": 9541 + }, + { + "epoch": 0.152672, + "grad_norm": 0.79296875, + "learning_rate": 8.541774193548387e-05, + "loss": 0.1684, + "step": 9542 + }, + { + "epoch": 0.152688, + "grad_norm": 0.625, + "learning_rate": 8.541612903225806e-05, + "loss": 0.1836, + "step": 9543 + }, + { + "epoch": 0.152704, + "grad_norm": 0.9453125, + "learning_rate": 8.541451612903226e-05, + "loss": 0.1786, + "step": 9544 + }, + { + "epoch": 0.15272, + "grad_norm": 0.74609375, + "learning_rate": 8.541290322580645e-05, + "loss": 0.1879, + "step": 9545 + }, + { + "epoch": 0.152736, + "grad_norm": 0.73828125, + "learning_rate": 8.541129032258065e-05, + "loss": 0.1595, + "step": 9546 + }, + { + "epoch": 0.152752, + "grad_norm": 0.85546875, + "learning_rate": 8.540967741935485e-05, + "loss": 0.1707, + "step": 9547 + }, + { + "epoch": 0.152768, + "grad_norm": 0.99609375, + "learning_rate": 8.540806451612904e-05, + "loss": 0.1702, + "step": 9548 + }, + { + "epoch": 0.152784, + "grad_norm": 0.6875, + "learning_rate": 8.540645161290323e-05, + "loss": 0.1666, + "step": 9549 + }, + { + "epoch": 0.1528, + "grad_norm": 1.125, + "learning_rate": 8.540483870967743e-05, + "loss": 0.1656, + "step": 9550 + }, + { + "epoch": 0.152816, + "grad_norm": 0.79296875, + "learning_rate": 8.540322580645162e-05, + "loss": 0.1395, + "step": 9551 + }, + { + "epoch": 0.152832, + "grad_norm": 0.765625, + "learning_rate": 8.540161290322582e-05, + "loss": 0.197, + "step": 9552 + }, + { + "epoch": 0.152848, + "grad_norm": 0.859375, + "learning_rate": 8.54e-05, + "loss": 0.201, + "step": 9553 + }, + { + "epoch": 0.152864, + "grad_norm": 0.7109375, + "learning_rate": 8.53983870967742e-05, + "loss": 0.157, + "step": 9554 + }, + { + "epoch": 0.15288, + "grad_norm": 0.71484375, + "learning_rate": 8.539677419354839e-05, + "loss": 0.1738, + "step": 9555 + }, + { + "epoch": 0.152896, + "grad_norm": 0.6796875, + "learning_rate": 8.539516129032259e-05, + "loss": 0.1627, + "step": 9556 + }, + { + "epoch": 0.152912, + "grad_norm": 0.67578125, + "learning_rate": 8.539354838709677e-05, + "loss": 0.1687, + "step": 9557 + }, + { + "epoch": 0.152928, + "grad_norm": 0.56640625, + "learning_rate": 8.539193548387097e-05, + "loss": 0.1912, + "step": 9558 + }, + { + "epoch": 0.152944, + "grad_norm": 0.8359375, + "learning_rate": 8.539032258064516e-05, + "loss": 0.1859, + "step": 9559 + }, + { + "epoch": 0.15296, + "grad_norm": 0.8515625, + "learning_rate": 8.538870967741936e-05, + "loss": 0.1343, + "step": 9560 + }, + { + "epoch": 0.152976, + "grad_norm": 0.75390625, + "learning_rate": 8.538709677419356e-05, + "loss": 0.1818, + "step": 9561 + }, + { + "epoch": 0.152992, + "grad_norm": 0.828125, + "learning_rate": 8.538548387096774e-05, + "loss": 0.1439, + "step": 9562 + }, + { + "epoch": 0.153008, + "grad_norm": 0.9140625, + "learning_rate": 8.538387096774194e-05, + "loss": 0.1671, + "step": 9563 + }, + { + "epoch": 0.153024, + "grad_norm": 1.2421875, + "learning_rate": 8.538225806451613e-05, + "loss": 0.1818, + "step": 9564 + }, + { + "epoch": 0.15304, + "grad_norm": 0.609375, + "learning_rate": 8.538064516129033e-05, + "loss": 0.1828, + "step": 9565 + }, + { + "epoch": 0.153056, + "grad_norm": 0.9375, + "learning_rate": 8.537903225806452e-05, + "loss": 0.167, + "step": 9566 + }, + { + "epoch": 0.153072, + "grad_norm": 0.94921875, + "learning_rate": 8.537741935483872e-05, + "loss": 0.1955, + "step": 9567 + }, + { + "epoch": 0.153088, + "grad_norm": 0.9453125, + "learning_rate": 8.53758064516129e-05, + "loss": 0.1926, + "step": 9568 + }, + { + "epoch": 0.153104, + "grad_norm": 1.0, + "learning_rate": 8.53741935483871e-05, + "loss": 0.1721, + "step": 9569 + }, + { + "epoch": 0.15312, + "grad_norm": 1.984375, + "learning_rate": 8.537258064516129e-05, + "loss": 0.1622, + "step": 9570 + }, + { + "epoch": 0.153136, + "grad_norm": 1.4921875, + "learning_rate": 8.537096774193549e-05, + "loss": 0.1536, + "step": 9571 + }, + { + "epoch": 0.153152, + "grad_norm": 1.1796875, + "learning_rate": 8.536935483870969e-05, + "loss": 0.2177, + "step": 9572 + }, + { + "epoch": 0.153168, + "grad_norm": 0.765625, + "learning_rate": 8.536774193548389e-05, + "loss": 0.1532, + "step": 9573 + }, + { + "epoch": 0.153184, + "grad_norm": 0.828125, + "learning_rate": 8.536612903225807e-05, + "loss": 0.1818, + "step": 9574 + }, + { + "epoch": 0.1532, + "grad_norm": 1.0546875, + "learning_rate": 8.536451612903226e-05, + "loss": 0.1745, + "step": 9575 + }, + { + "epoch": 0.153216, + "grad_norm": 1.171875, + "learning_rate": 8.536290322580646e-05, + "loss": 0.1936, + "step": 9576 + }, + { + "epoch": 0.153232, + "grad_norm": 0.92578125, + "learning_rate": 8.536129032258064e-05, + "loss": 0.1915, + "step": 9577 + }, + { + "epoch": 0.153248, + "grad_norm": 0.984375, + "learning_rate": 8.535967741935484e-05, + "loss": 0.141, + "step": 9578 + }, + { + "epoch": 0.153264, + "grad_norm": 0.76953125, + "learning_rate": 8.535806451612903e-05, + "loss": 0.1674, + "step": 9579 + }, + { + "epoch": 0.15328, + "grad_norm": 0.65625, + "learning_rate": 8.535645161290323e-05, + "loss": 0.1739, + "step": 9580 + }, + { + "epoch": 0.153296, + "grad_norm": 0.75390625, + "learning_rate": 8.535483870967742e-05, + "loss": 0.2014, + "step": 9581 + }, + { + "epoch": 0.153312, + "grad_norm": 1.0703125, + "learning_rate": 8.535322580645162e-05, + "loss": 0.2053, + "step": 9582 + }, + { + "epoch": 0.153328, + "grad_norm": 0.7890625, + "learning_rate": 8.535161290322581e-05, + "loss": 0.1737, + "step": 9583 + }, + { + "epoch": 0.153344, + "grad_norm": 0.62890625, + "learning_rate": 8.535e-05, + "loss": 0.175, + "step": 9584 + }, + { + "epoch": 0.15336, + "grad_norm": 0.6796875, + "learning_rate": 8.53483870967742e-05, + "loss": 0.1594, + "step": 9585 + }, + { + "epoch": 0.153376, + "grad_norm": 1.078125, + "learning_rate": 8.53467741935484e-05, + "loss": 0.1609, + "step": 9586 + }, + { + "epoch": 0.153392, + "grad_norm": 0.6484375, + "learning_rate": 8.534516129032259e-05, + "loss": 0.1274, + "step": 9587 + }, + { + "epoch": 0.153408, + "grad_norm": 0.98828125, + "learning_rate": 8.534354838709679e-05, + "loss": 0.2117, + "step": 9588 + }, + { + "epoch": 0.153424, + "grad_norm": 0.7734375, + "learning_rate": 8.534193548387097e-05, + "loss": 0.1871, + "step": 9589 + }, + { + "epoch": 0.15344, + "grad_norm": 0.80078125, + "learning_rate": 8.534032258064516e-05, + "loss": 0.1701, + "step": 9590 + }, + { + "epoch": 0.153456, + "grad_norm": 0.8125, + "learning_rate": 8.533870967741936e-05, + "loss": 0.172, + "step": 9591 + }, + { + "epoch": 0.153472, + "grad_norm": 0.796875, + "learning_rate": 8.533709677419354e-05, + "loss": 0.1453, + "step": 9592 + }, + { + "epoch": 0.153488, + "grad_norm": 0.7578125, + "learning_rate": 8.533548387096774e-05, + "loss": 0.1272, + "step": 9593 + }, + { + "epoch": 0.153504, + "grad_norm": 0.61328125, + "learning_rate": 8.533387096774193e-05, + "loss": 0.1389, + "step": 9594 + }, + { + "epoch": 0.15352, + "grad_norm": 0.75, + "learning_rate": 8.533225806451613e-05, + "loss": 0.2078, + "step": 9595 + }, + { + "epoch": 0.153536, + "grad_norm": 0.6171875, + "learning_rate": 8.533064516129033e-05, + "loss": 0.1552, + "step": 9596 + }, + { + "epoch": 0.153552, + "grad_norm": 1.1015625, + "learning_rate": 8.532903225806453e-05, + "loss": 0.28, + "step": 9597 + }, + { + "epoch": 0.153568, + "grad_norm": 0.66015625, + "learning_rate": 8.532741935483871e-05, + "loss": 0.1797, + "step": 9598 + }, + { + "epoch": 0.153584, + "grad_norm": 0.6171875, + "learning_rate": 8.532580645161291e-05, + "loss": 0.1698, + "step": 9599 + }, + { + "epoch": 0.1536, + "grad_norm": 0.69921875, + "learning_rate": 8.53241935483871e-05, + "loss": 0.1498, + "step": 9600 + }, + { + "epoch": 0.153616, + "grad_norm": 0.734375, + "learning_rate": 8.53225806451613e-05, + "loss": 0.1979, + "step": 9601 + }, + { + "epoch": 0.153632, + "grad_norm": 0.75, + "learning_rate": 8.532096774193549e-05, + "loss": 0.1652, + "step": 9602 + }, + { + "epoch": 0.153648, + "grad_norm": 0.6328125, + "learning_rate": 8.531935483870969e-05, + "loss": 0.1194, + "step": 9603 + }, + { + "epoch": 0.153664, + "grad_norm": 1.390625, + "learning_rate": 8.531774193548387e-05, + "loss": 0.2174, + "step": 9604 + }, + { + "epoch": 0.15368, + "grad_norm": 1.2578125, + "learning_rate": 8.531612903225806e-05, + "loss": 0.177, + "step": 9605 + }, + { + "epoch": 0.153696, + "grad_norm": 0.6796875, + "learning_rate": 8.531451612903226e-05, + "loss": 0.1633, + "step": 9606 + }, + { + "epoch": 0.153712, + "grad_norm": 0.75390625, + "learning_rate": 8.531290322580646e-05, + "loss": 0.1691, + "step": 9607 + }, + { + "epoch": 0.153728, + "grad_norm": 0.5859375, + "learning_rate": 8.531129032258066e-05, + "loss": 0.1749, + "step": 9608 + }, + { + "epoch": 0.153744, + "grad_norm": 0.5546875, + "learning_rate": 8.530967741935484e-05, + "loss": 0.1683, + "step": 9609 + }, + { + "epoch": 0.15376, + "grad_norm": 1.578125, + "learning_rate": 8.530806451612904e-05, + "loss": 0.211, + "step": 9610 + }, + { + "epoch": 0.153776, + "grad_norm": 0.734375, + "learning_rate": 8.530645161290323e-05, + "loss": 0.2147, + "step": 9611 + }, + { + "epoch": 0.153792, + "grad_norm": 1.0859375, + "learning_rate": 8.530483870967743e-05, + "loss": 0.2048, + "step": 9612 + }, + { + "epoch": 0.153808, + "grad_norm": 0.734375, + "learning_rate": 8.530322580645161e-05, + "loss": 0.1729, + "step": 9613 + }, + { + "epoch": 0.153824, + "grad_norm": 0.6796875, + "learning_rate": 8.530161290322581e-05, + "loss": 0.1689, + "step": 9614 + }, + { + "epoch": 0.15384, + "grad_norm": 0.9453125, + "learning_rate": 8.53e-05, + "loss": 0.214, + "step": 9615 + }, + { + "epoch": 0.153856, + "grad_norm": 0.546875, + "learning_rate": 8.52983870967742e-05, + "loss": 0.1716, + "step": 9616 + }, + { + "epoch": 0.153872, + "grad_norm": 0.6875, + "learning_rate": 8.529677419354839e-05, + "loss": 0.1927, + "step": 9617 + }, + { + "epoch": 0.153888, + "grad_norm": 0.796875, + "learning_rate": 8.529516129032259e-05, + "loss": 0.2084, + "step": 9618 + }, + { + "epoch": 0.153904, + "grad_norm": 0.8203125, + "learning_rate": 8.529354838709677e-05, + "loss": 0.1603, + "step": 9619 + }, + { + "epoch": 0.15392, + "grad_norm": 0.796875, + "learning_rate": 8.529193548387097e-05, + "loss": 0.2044, + "step": 9620 + }, + { + "epoch": 0.153936, + "grad_norm": 0.578125, + "learning_rate": 8.529032258064517e-05, + "loss": 0.1577, + "step": 9621 + }, + { + "epoch": 0.153952, + "grad_norm": 0.98828125, + "learning_rate": 8.528870967741936e-05, + "loss": 0.1512, + "step": 9622 + }, + { + "epoch": 0.153968, + "grad_norm": 0.890625, + "learning_rate": 8.528709677419356e-05, + "loss": 0.1583, + "step": 9623 + }, + { + "epoch": 0.153984, + "grad_norm": 0.8125, + "learning_rate": 8.528548387096774e-05, + "loss": 0.1671, + "step": 9624 + }, + { + "epoch": 0.154, + "grad_norm": 0.5703125, + "learning_rate": 8.528387096774194e-05, + "loss": 0.1754, + "step": 9625 + }, + { + "epoch": 0.154016, + "grad_norm": 0.69140625, + "learning_rate": 8.528225806451613e-05, + "loss": 0.2, + "step": 9626 + }, + { + "epoch": 0.154032, + "grad_norm": 0.734375, + "learning_rate": 8.528064516129033e-05, + "loss": 0.1531, + "step": 9627 + }, + { + "epoch": 0.154048, + "grad_norm": 0.6328125, + "learning_rate": 8.527903225806451e-05, + "loss": 0.158, + "step": 9628 + }, + { + "epoch": 0.154064, + "grad_norm": 1.1171875, + "learning_rate": 8.527741935483871e-05, + "loss": 0.1856, + "step": 9629 + }, + { + "epoch": 0.15408, + "grad_norm": 0.9765625, + "learning_rate": 8.52758064516129e-05, + "loss": 0.1572, + "step": 9630 + }, + { + "epoch": 0.154096, + "grad_norm": 0.671875, + "learning_rate": 8.52741935483871e-05, + "loss": 0.1636, + "step": 9631 + }, + { + "epoch": 0.154112, + "grad_norm": 0.68359375, + "learning_rate": 8.52725806451613e-05, + "loss": 0.2023, + "step": 9632 + }, + { + "epoch": 0.154128, + "grad_norm": 0.78515625, + "learning_rate": 8.52709677419355e-05, + "loss": 0.1904, + "step": 9633 + }, + { + "epoch": 0.154144, + "grad_norm": 0.83203125, + "learning_rate": 8.526935483870968e-05, + "loss": 0.1544, + "step": 9634 + }, + { + "epoch": 0.15416, + "grad_norm": 0.466796875, + "learning_rate": 8.526774193548388e-05, + "loss": 0.1366, + "step": 9635 + }, + { + "epoch": 0.154176, + "grad_norm": 0.5703125, + "learning_rate": 8.526612903225807e-05, + "loss": 0.1692, + "step": 9636 + }, + { + "epoch": 0.154192, + "grad_norm": 0.423828125, + "learning_rate": 8.526451612903226e-05, + "loss": 0.135, + "step": 9637 + }, + { + "epoch": 0.154208, + "grad_norm": 0.89453125, + "learning_rate": 8.526290322580646e-05, + "loss": 0.1717, + "step": 9638 + }, + { + "epoch": 0.154224, + "grad_norm": 0.8515625, + "learning_rate": 8.526129032258064e-05, + "loss": 0.1564, + "step": 9639 + }, + { + "epoch": 0.15424, + "grad_norm": 0.828125, + "learning_rate": 8.525967741935484e-05, + "loss": 0.1665, + "step": 9640 + }, + { + "epoch": 0.154256, + "grad_norm": 0.85546875, + "learning_rate": 8.525806451612903e-05, + "loss": 0.1545, + "step": 9641 + }, + { + "epoch": 0.154272, + "grad_norm": 0.87890625, + "learning_rate": 8.525645161290323e-05, + "loss": 0.2267, + "step": 9642 + }, + { + "epoch": 0.154288, + "grad_norm": 0.66015625, + "learning_rate": 8.525483870967743e-05, + "loss": 0.176, + "step": 9643 + }, + { + "epoch": 0.154304, + "grad_norm": 0.8828125, + "learning_rate": 8.525322580645163e-05, + "loss": 0.1932, + "step": 9644 + }, + { + "epoch": 0.15432, + "grad_norm": 1.078125, + "learning_rate": 8.525161290322581e-05, + "loss": 0.1851, + "step": 9645 + }, + { + "epoch": 0.154336, + "grad_norm": 0.60546875, + "learning_rate": 8.525000000000001e-05, + "loss": 0.186, + "step": 9646 + }, + { + "epoch": 0.154352, + "grad_norm": 0.8984375, + "learning_rate": 8.52483870967742e-05, + "loss": 0.1979, + "step": 9647 + }, + { + "epoch": 0.154368, + "grad_norm": 0.6171875, + "learning_rate": 8.52467741935484e-05, + "loss": 0.1379, + "step": 9648 + }, + { + "epoch": 0.154384, + "grad_norm": 0.7734375, + "learning_rate": 8.524516129032258e-05, + "loss": 0.1844, + "step": 9649 + }, + { + "epoch": 0.1544, + "grad_norm": 0.8203125, + "learning_rate": 8.524354838709678e-05, + "loss": 0.1747, + "step": 9650 + }, + { + "epoch": 0.154416, + "grad_norm": 0.91015625, + "learning_rate": 8.524193548387097e-05, + "loss": 0.1814, + "step": 9651 + }, + { + "epoch": 0.154432, + "grad_norm": 0.8671875, + "learning_rate": 8.524032258064516e-05, + "loss": 0.152, + "step": 9652 + }, + { + "epoch": 0.154448, + "grad_norm": 0.77734375, + "learning_rate": 8.523870967741936e-05, + "loss": 0.1391, + "step": 9653 + }, + { + "epoch": 0.154464, + "grad_norm": 0.6171875, + "learning_rate": 8.523709677419354e-05, + "loss": 0.1704, + "step": 9654 + }, + { + "epoch": 0.15448, + "grad_norm": 0.609375, + "learning_rate": 8.523548387096774e-05, + "loss": 0.155, + "step": 9655 + }, + { + "epoch": 0.154496, + "grad_norm": 0.671875, + "learning_rate": 8.523387096774194e-05, + "loss": 0.1626, + "step": 9656 + }, + { + "epoch": 0.154512, + "grad_norm": 0.89453125, + "learning_rate": 8.523225806451614e-05, + "loss": 0.2259, + "step": 9657 + }, + { + "epoch": 0.154528, + "grad_norm": 0.8125, + "learning_rate": 8.523064516129033e-05, + "loss": 0.1606, + "step": 9658 + }, + { + "epoch": 0.154544, + "grad_norm": 0.9296875, + "learning_rate": 8.522903225806453e-05, + "loss": 0.1984, + "step": 9659 + }, + { + "epoch": 0.15456, + "grad_norm": 0.609375, + "learning_rate": 8.522741935483871e-05, + "loss": 0.1833, + "step": 9660 + }, + { + "epoch": 0.154576, + "grad_norm": 0.62890625, + "learning_rate": 8.522580645161291e-05, + "loss": 0.173, + "step": 9661 + }, + { + "epoch": 0.154592, + "grad_norm": 0.7734375, + "learning_rate": 8.52241935483871e-05, + "loss": 0.2, + "step": 9662 + }, + { + "epoch": 0.154608, + "grad_norm": 0.9453125, + "learning_rate": 8.52225806451613e-05, + "loss": 0.1624, + "step": 9663 + }, + { + "epoch": 0.154624, + "grad_norm": 0.59765625, + "learning_rate": 8.522096774193548e-05, + "loss": 0.1483, + "step": 9664 + }, + { + "epoch": 0.15464, + "grad_norm": 0.9453125, + "learning_rate": 8.521935483870968e-05, + "loss": 0.1663, + "step": 9665 + }, + { + "epoch": 0.154656, + "grad_norm": 0.96875, + "learning_rate": 8.521774193548387e-05, + "loss": 0.2033, + "step": 9666 + }, + { + "epoch": 0.154672, + "grad_norm": 0.71484375, + "learning_rate": 8.521612903225807e-05, + "loss": 0.1767, + "step": 9667 + }, + { + "epoch": 0.154688, + "grad_norm": 0.9921875, + "learning_rate": 8.521451612903227e-05, + "loss": 0.1791, + "step": 9668 + }, + { + "epoch": 0.154704, + "grad_norm": 1.0859375, + "learning_rate": 8.521290322580645e-05, + "loss": 0.1398, + "step": 9669 + }, + { + "epoch": 0.15472, + "grad_norm": 0.7265625, + "learning_rate": 8.521129032258065e-05, + "loss": 0.1317, + "step": 9670 + }, + { + "epoch": 0.154736, + "grad_norm": 0.72265625, + "learning_rate": 8.520967741935484e-05, + "loss": 0.1501, + "step": 9671 + }, + { + "epoch": 0.154752, + "grad_norm": 0.625, + "learning_rate": 8.520806451612904e-05, + "loss": 0.1763, + "step": 9672 + }, + { + "epoch": 0.154768, + "grad_norm": 0.6953125, + "learning_rate": 8.520645161290323e-05, + "loss": 0.1999, + "step": 9673 + }, + { + "epoch": 0.154784, + "grad_norm": 0.498046875, + "learning_rate": 8.520483870967743e-05, + "loss": 0.1717, + "step": 9674 + }, + { + "epoch": 0.1548, + "grad_norm": 0.51953125, + "learning_rate": 8.520322580645161e-05, + "loss": 0.1774, + "step": 9675 + }, + { + "epoch": 0.154816, + "grad_norm": 0.5703125, + "learning_rate": 8.520161290322581e-05, + "loss": 0.1868, + "step": 9676 + }, + { + "epoch": 0.154832, + "grad_norm": 0.625, + "learning_rate": 8.52e-05, + "loss": 0.1448, + "step": 9677 + }, + { + "epoch": 0.154848, + "grad_norm": 0.65625, + "learning_rate": 8.51983870967742e-05, + "loss": 0.1543, + "step": 9678 + }, + { + "epoch": 0.154864, + "grad_norm": 0.73828125, + "learning_rate": 8.519677419354838e-05, + "loss": 0.1789, + "step": 9679 + }, + { + "epoch": 0.15488, + "grad_norm": 0.56640625, + "learning_rate": 8.519516129032258e-05, + "loss": 0.1494, + "step": 9680 + }, + { + "epoch": 0.154896, + "grad_norm": 0.6484375, + "learning_rate": 8.519354838709678e-05, + "loss": 0.1372, + "step": 9681 + }, + { + "epoch": 0.154912, + "grad_norm": 0.96875, + "learning_rate": 8.519193548387098e-05, + "loss": 0.187, + "step": 9682 + }, + { + "epoch": 0.154928, + "grad_norm": 1.046875, + "learning_rate": 8.519032258064517e-05, + "loss": 0.2058, + "step": 9683 + }, + { + "epoch": 0.154944, + "grad_norm": 0.83984375, + "learning_rate": 8.518870967741935e-05, + "loss": 0.2087, + "step": 9684 + }, + { + "epoch": 0.15496, + "grad_norm": 0.98046875, + "learning_rate": 8.518709677419355e-05, + "loss": 0.1849, + "step": 9685 + }, + { + "epoch": 0.154976, + "grad_norm": 0.89453125, + "learning_rate": 8.518548387096774e-05, + "loss": 0.1959, + "step": 9686 + }, + { + "epoch": 0.154992, + "grad_norm": 0.828125, + "learning_rate": 8.518387096774194e-05, + "loss": 0.2087, + "step": 9687 + }, + { + "epoch": 0.155008, + "grad_norm": 0.7421875, + "learning_rate": 8.518225806451613e-05, + "loss": 0.1807, + "step": 9688 + }, + { + "epoch": 0.155024, + "grad_norm": 1.1484375, + "learning_rate": 8.518064516129033e-05, + "loss": 0.1505, + "step": 9689 + }, + { + "epoch": 0.15504, + "grad_norm": 1.296875, + "learning_rate": 8.517903225806451e-05, + "loss": 0.1715, + "step": 9690 + }, + { + "epoch": 0.155056, + "grad_norm": 0.671875, + "learning_rate": 8.517741935483871e-05, + "loss": 0.1768, + "step": 9691 + }, + { + "epoch": 0.155072, + "grad_norm": 0.8125, + "learning_rate": 8.517580645161291e-05, + "loss": 0.1456, + "step": 9692 + }, + { + "epoch": 0.155088, + "grad_norm": 1.1484375, + "learning_rate": 8.517419354838711e-05, + "loss": 0.1814, + "step": 9693 + }, + { + "epoch": 0.155104, + "grad_norm": 0.90234375, + "learning_rate": 8.51725806451613e-05, + "loss": 0.1501, + "step": 9694 + }, + { + "epoch": 0.15512, + "grad_norm": 0.59765625, + "learning_rate": 8.51709677419355e-05, + "loss": 0.184, + "step": 9695 + }, + { + "epoch": 0.155136, + "grad_norm": 1.15625, + "learning_rate": 8.516935483870968e-05, + "loss": 0.1991, + "step": 9696 + }, + { + "epoch": 0.155152, + "grad_norm": 0.6796875, + "learning_rate": 8.516774193548388e-05, + "loss": 0.1732, + "step": 9697 + }, + { + "epoch": 0.155168, + "grad_norm": 0.73046875, + "learning_rate": 8.516612903225807e-05, + "loss": 0.1617, + "step": 9698 + }, + { + "epoch": 0.155184, + "grad_norm": 0.85546875, + "learning_rate": 8.516451612903225e-05, + "loss": 0.1858, + "step": 9699 + }, + { + "epoch": 0.1552, + "grad_norm": 0.98046875, + "learning_rate": 8.516290322580645e-05, + "loss": 0.1705, + "step": 9700 + }, + { + "epoch": 0.155216, + "grad_norm": 1.1875, + "learning_rate": 8.516129032258064e-05, + "loss": 0.1614, + "step": 9701 + }, + { + "epoch": 0.155232, + "grad_norm": 0.6484375, + "learning_rate": 8.515967741935484e-05, + "loss": 0.1679, + "step": 9702 + }, + { + "epoch": 0.155248, + "grad_norm": 0.671875, + "learning_rate": 8.515806451612904e-05, + "loss": 0.1661, + "step": 9703 + }, + { + "epoch": 0.155264, + "grad_norm": 1.140625, + "learning_rate": 8.515645161290324e-05, + "loss": 0.1899, + "step": 9704 + }, + { + "epoch": 0.15528, + "grad_norm": 0.99609375, + "learning_rate": 8.515483870967742e-05, + "loss": 0.14, + "step": 9705 + }, + { + "epoch": 0.155296, + "grad_norm": 1.1640625, + "learning_rate": 8.515322580645162e-05, + "loss": 0.1739, + "step": 9706 + }, + { + "epoch": 0.155312, + "grad_norm": 0.71875, + "learning_rate": 8.515161290322581e-05, + "loss": 0.189, + "step": 9707 + }, + { + "epoch": 0.155328, + "grad_norm": 0.66796875, + "learning_rate": 8.515000000000001e-05, + "loss": 0.1325, + "step": 9708 + }, + { + "epoch": 0.155344, + "grad_norm": 0.81640625, + "learning_rate": 8.51483870967742e-05, + "loss": 0.2029, + "step": 9709 + }, + { + "epoch": 0.15536, + "grad_norm": 0.5546875, + "learning_rate": 8.51467741935484e-05, + "loss": 0.1495, + "step": 9710 + }, + { + "epoch": 0.155376, + "grad_norm": 0.75, + "learning_rate": 8.514516129032258e-05, + "loss": 0.1912, + "step": 9711 + }, + { + "epoch": 0.155392, + "grad_norm": 0.7734375, + "learning_rate": 8.514354838709678e-05, + "loss": 0.1976, + "step": 9712 + }, + { + "epoch": 0.155408, + "grad_norm": 1.078125, + "learning_rate": 8.514193548387097e-05, + "loss": 0.1829, + "step": 9713 + }, + { + "epoch": 0.155424, + "grad_norm": 0.9609375, + "learning_rate": 8.514032258064515e-05, + "loss": 0.2139, + "step": 9714 + }, + { + "epoch": 0.15544, + "grad_norm": 0.8828125, + "learning_rate": 8.513870967741935e-05, + "loss": 0.1601, + "step": 9715 + }, + { + "epoch": 0.155456, + "grad_norm": 0.875, + "learning_rate": 8.513709677419355e-05, + "loss": 0.2173, + "step": 9716 + }, + { + "epoch": 0.155472, + "grad_norm": 1.3828125, + "learning_rate": 8.513548387096775e-05, + "loss": 0.1999, + "step": 9717 + }, + { + "epoch": 0.155488, + "grad_norm": 0.86328125, + "learning_rate": 8.513387096774194e-05, + "loss": 0.222, + "step": 9718 + }, + { + "epoch": 0.155504, + "grad_norm": 0.75390625, + "learning_rate": 8.513225806451614e-05, + "loss": 0.1812, + "step": 9719 + }, + { + "epoch": 0.15552, + "grad_norm": 1.0390625, + "learning_rate": 8.513064516129032e-05, + "loss": 0.196, + "step": 9720 + }, + { + "epoch": 0.155536, + "grad_norm": 1.015625, + "learning_rate": 8.512903225806452e-05, + "loss": 0.1462, + "step": 9721 + }, + { + "epoch": 0.155552, + "grad_norm": 1.0234375, + "learning_rate": 8.512741935483871e-05, + "loss": 0.1752, + "step": 9722 + }, + { + "epoch": 0.155568, + "grad_norm": 0.6484375, + "learning_rate": 8.512580645161291e-05, + "loss": 0.1428, + "step": 9723 + }, + { + "epoch": 0.155584, + "grad_norm": 0.8671875, + "learning_rate": 8.51241935483871e-05, + "loss": 0.1187, + "step": 9724 + }, + { + "epoch": 0.1556, + "grad_norm": 0.82421875, + "learning_rate": 8.51225806451613e-05, + "loss": 0.1806, + "step": 9725 + }, + { + "epoch": 0.155616, + "grad_norm": 0.890625, + "learning_rate": 8.512096774193548e-05, + "loss": 0.1949, + "step": 9726 + }, + { + "epoch": 0.155632, + "grad_norm": 0.8671875, + "learning_rate": 8.511935483870968e-05, + "loss": 0.1526, + "step": 9727 + }, + { + "epoch": 0.155648, + "grad_norm": 0.66015625, + "learning_rate": 8.511774193548388e-05, + "loss": 0.16, + "step": 9728 + }, + { + "epoch": 0.155664, + "grad_norm": 1.28125, + "learning_rate": 8.511612903225808e-05, + "loss": 0.1694, + "step": 9729 + }, + { + "epoch": 0.15568, + "grad_norm": 0.7421875, + "learning_rate": 8.511451612903227e-05, + "loss": 0.2019, + "step": 9730 + }, + { + "epoch": 0.155696, + "grad_norm": 0.97265625, + "learning_rate": 8.511290322580645e-05, + "loss": 0.1835, + "step": 9731 + }, + { + "epoch": 0.155712, + "grad_norm": 0.59375, + "learning_rate": 8.511129032258065e-05, + "loss": 0.1889, + "step": 9732 + }, + { + "epoch": 0.155728, + "grad_norm": 1.2421875, + "learning_rate": 8.510967741935484e-05, + "loss": 0.1779, + "step": 9733 + }, + { + "epoch": 0.155744, + "grad_norm": 1.0078125, + "learning_rate": 8.510806451612904e-05, + "loss": 0.1791, + "step": 9734 + }, + { + "epoch": 0.15576, + "grad_norm": 1.3828125, + "learning_rate": 8.510645161290322e-05, + "loss": 0.2105, + "step": 9735 + }, + { + "epoch": 0.155776, + "grad_norm": 0.640625, + "learning_rate": 8.510483870967742e-05, + "loss": 0.1457, + "step": 9736 + }, + { + "epoch": 0.155792, + "grad_norm": 0.9921875, + "learning_rate": 8.510322580645161e-05, + "loss": 0.1977, + "step": 9737 + }, + { + "epoch": 0.155808, + "grad_norm": 1.0625, + "learning_rate": 8.510161290322581e-05, + "loss": 0.2022, + "step": 9738 + }, + { + "epoch": 0.155824, + "grad_norm": 1.2734375, + "learning_rate": 8.510000000000001e-05, + "loss": 0.1814, + "step": 9739 + }, + { + "epoch": 0.15584, + "grad_norm": 0.70703125, + "learning_rate": 8.509838709677421e-05, + "loss": 0.1598, + "step": 9740 + }, + { + "epoch": 0.155856, + "grad_norm": 0.6640625, + "learning_rate": 8.50967741935484e-05, + "loss": 0.1874, + "step": 9741 + }, + { + "epoch": 0.155872, + "grad_norm": 0.73828125, + "learning_rate": 8.50951612903226e-05, + "loss": 0.173, + "step": 9742 + }, + { + "epoch": 0.155888, + "grad_norm": 0.76953125, + "learning_rate": 8.509354838709678e-05, + "loss": 0.1377, + "step": 9743 + }, + { + "epoch": 0.155904, + "grad_norm": 1.6484375, + "learning_rate": 8.509193548387098e-05, + "loss": 0.1955, + "step": 9744 + }, + { + "epoch": 0.15592, + "grad_norm": 1.1953125, + "learning_rate": 8.509032258064517e-05, + "loss": 0.1699, + "step": 9745 + }, + { + "epoch": 0.155936, + "grad_norm": 1.984375, + "learning_rate": 8.508870967741935e-05, + "loss": 0.1688, + "step": 9746 + }, + { + "epoch": 0.155952, + "grad_norm": 0.78125, + "learning_rate": 8.508709677419355e-05, + "loss": 0.1671, + "step": 9747 + }, + { + "epoch": 0.155968, + "grad_norm": 0.6171875, + "learning_rate": 8.508548387096774e-05, + "loss": 0.1794, + "step": 9748 + }, + { + "epoch": 0.155984, + "grad_norm": 0.9609375, + "learning_rate": 8.508387096774194e-05, + "loss": 0.1756, + "step": 9749 + }, + { + "epoch": 0.156, + "grad_norm": 0.828125, + "learning_rate": 8.508225806451612e-05, + "loss": 0.1527, + "step": 9750 + }, + { + "epoch": 0.156016, + "grad_norm": 1.015625, + "learning_rate": 8.508064516129032e-05, + "loss": 0.1784, + "step": 9751 + }, + { + "epoch": 0.156032, + "grad_norm": 0.9921875, + "learning_rate": 8.507903225806452e-05, + "loss": 0.1606, + "step": 9752 + }, + { + "epoch": 0.156048, + "grad_norm": 0.80078125, + "learning_rate": 8.507741935483872e-05, + "loss": 0.1921, + "step": 9753 + }, + { + "epoch": 0.156064, + "grad_norm": 1.28125, + "learning_rate": 8.507580645161291e-05, + "loss": 0.1985, + "step": 9754 + }, + { + "epoch": 0.15608, + "grad_norm": 0.8203125, + "learning_rate": 8.507419354838711e-05, + "loss": 0.1731, + "step": 9755 + }, + { + "epoch": 0.156096, + "grad_norm": 0.68359375, + "learning_rate": 8.50725806451613e-05, + "loss": 0.1882, + "step": 9756 + }, + { + "epoch": 0.156112, + "grad_norm": 0.8828125, + "learning_rate": 8.50709677419355e-05, + "loss": 0.1772, + "step": 9757 + }, + { + "epoch": 0.156128, + "grad_norm": 0.83984375, + "learning_rate": 8.506935483870968e-05, + "loss": 0.1723, + "step": 9758 + }, + { + "epoch": 0.156144, + "grad_norm": 0.7265625, + "learning_rate": 8.506774193548388e-05, + "loss": 0.1696, + "step": 9759 + }, + { + "epoch": 0.15616, + "grad_norm": 0.9140625, + "learning_rate": 8.506612903225807e-05, + "loss": 0.1701, + "step": 9760 + }, + { + "epoch": 0.156176, + "grad_norm": 0.66796875, + "learning_rate": 8.506451612903225e-05, + "loss": 0.1732, + "step": 9761 + }, + { + "epoch": 0.156192, + "grad_norm": 0.7734375, + "learning_rate": 8.506290322580645e-05, + "loss": 0.2059, + "step": 9762 + }, + { + "epoch": 0.156208, + "grad_norm": 0.65234375, + "learning_rate": 8.506129032258065e-05, + "loss": 0.2289, + "step": 9763 + }, + { + "epoch": 0.156224, + "grad_norm": 0.7890625, + "learning_rate": 8.505967741935485e-05, + "loss": 0.154, + "step": 9764 + }, + { + "epoch": 0.15624, + "grad_norm": 0.51171875, + "learning_rate": 8.505806451612904e-05, + "loss": 0.1734, + "step": 9765 + }, + { + "epoch": 0.156256, + "grad_norm": 1.4375, + "learning_rate": 8.505645161290324e-05, + "loss": 0.1865, + "step": 9766 + }, + { + "epoch": 0.156272, + "grad_norm": 1.078125, + "learning_rate": 8.505483870967742e-05, + "loss": 0.1799, + "step": 9767 + }, + { + "epoch": 0.156288, + "grad_norm": 0.7734375, + "learning_rate": 8.505322580645162e-05, + "loss": 0.1563, + "step": 9768 + }, + { + "epoch": 0.156304, + "grad_norm": 0.74609375, + "learning_rate": 8.505161290322581e-05, + "loss": 0.1641, + "step": 9769 + }, + { + "epoch": 0.15632, + "grad_norm": 1.0625, + "learning_rate": 8.505000000000001e-05, + "loss": 0.1726, + "step": 9770 + }, + { + "epoch": 0.156336, + "grad_norm": 0.9140625, + "learning_rate": 8.50483870967742e-05, + "loss": 0.2057, + "step": 9771 + }, + { + "epoch": 0.156352, + "grad_norm": 1.0078125, + "learning_rate": 8.504677419354839e-05, + "loss": 0.1849, + "step": 9772 + }, + { + "epoch": 0.156368, + "grad_norm": 0.8203125, + "learning_rate": 8.504516129032258e-05, + "loss": 0.165, + "step": 9773 + }, + { + "epoch": 0.156384, + "grad_norm": 0.5, + "learning_rate": 8.504354838709678e-05, + "loss": 0.1651, + "step": 9774 + }, + { + "epoch": 0.1564, + "grad_norm": 0.97265625, + "learning_rate": 8.504193548387096e-05, + "loss": 0.1673, + "step": 9775 + }, + { + "epoch": 0.156416, + "grad_norm": 0.55859375, + "learning_rate": 8.504032258064516e-05, + "loss": 0.1337, + "step": 9776 + }, + { + "epoch": 0.156432, + "grad_norm": 1.3515625, + "learning_rate": 8.503870967741936e-05, + "loss": 0.208, + "step": 9777 + }, + { + "epoch": 0.156448, + "grad_norm": 0.68359375, + "learning_rate": 8.503709677419355e-05, + "loss": 0.2025, + "step": 9778 + }, + { + "epoch": 0.156464, + "grad_norm": 1.15625, + "learning_rate": 8.503548387096775e-05, + "loss": 0.2053, + "step": 9779 + }, + { + "epoch": 0.15648, + "grad_norm": 0.55859375, + "learning_rate": 8.503387096774194e-05, + "loss": 0.154, + "step": 9780 + }, + { + "epoch": 0.156496, + "grad_norm": 0.87890625, + "learning_rate": 8.503225806451614e-05, + "loss": 0.159, + "step": 9781 + }, + { + "epoch": 0.156512, + "grad_norm": 0.76171875, + "learning_rate": 8.503064516129032e-05, + "loss": 0.1515, + "step": 9782 + }, + { + "epoch": 0.156528, + "grad_norm": 0.734375, + "learning_rate": 8.502903225806452e-05, + "loss": 0.1816, + "step": 9783 + }, + { + "epoch": 0.156544, + "grad_norm": 0.6171875, + "learning_rate": 8.502741935483871e-05, + "loss": 0.1474, + "step": 9784 + }, + { + "epoch": 0.15656, + "grad_norm": 0.86328125, + "learning_rate": 8.502580645161291e-05, + "loss": 0.1578, + "step": 9785 + }, + { + "epoch": 0.156576, + "grad_norm": 0.703125, + "learning_rate": 8.502419354838709e-05, + "loss": 0.16, + "step": 9786 + }, + { + "epoch": 0.156592, + "grad_norm": 0.703125, + "learning_rate": 8.502258064516129e-05, + "loss": 0.1891, + "step": 9787 + }, + { + "epoch": 0.156608, + "grad_norm": 0.99609375, + "learning_rate": 8.502096774193549e-05, + "loss": 0.1899, + "step": 9788 + }, + { + "epoch": 0.156624, + "grad_norm": 0.8203125, + "learning_rate": 8.501935483870969e-05, + "loss": 0.1946, + "step": 9789 + }, + { + "epoch": 0.15664, + "grad_norm": 0.82421875, + "learning_rate": 8.501774193548388e-05, + "loss": 0.1594, + "step": 9790 + }, + { + "epoch": 0.156656, + "grad_norm": 1.1484375, + "learning_rate": 8.501612903225808e-05, + "loss": 0.1829, + "step": 9791 + }, + { + "epoch": 0.156672, + "grad_norm": 0.62109375, + "learning_rate": 8.501451612903226e-05, + "loss": 0.2085, + "step": 9792 + }, + { + "epoch": 0.156688, + "grad_norm": 1.0859375, + "learning_rate": 8.501290322580645e-05, + "loss": 0.184, + "step": 9793 + }, + { + "epoch": 0.156704, + "grad_norm": 0.5390625, + "learning_rate": 8.501129032258065e-05, + "loss": 0.1634, + "step": 9794 + }, + { + "epoch": 0.15672, + "grad_norm": 0.84765625, + "learning_rate": 8.500967741935484e-05, + "loss": 0.1782, + "step": 9795 + }, + { + "epoch": 0.156736, + "grad_norm": 0.74609375, + "learning_rate": 8.500806451612904e-05, + "loss": 0.157, + "step": 9796 + }, + { + "epoch": 0.156752, + "grad_norm": 0.73828125, + "learning_rate": 8.500645161290322e-05, + "loss": 0.1493, + "step": 9797 + }, + { + "epoch": 0.156768, + "grad_norm": 0.85546875, + "learning_rate": 8.500483870967742e-05, + "loss": 0.1719, + "step": 9798 + }, + { + "epoch": 0.156784, + "grad_norm": 0.5625, + "learning_rate": 8.500322580645162e-05, + "loss": 0.1394, + "step": 9799 + }, + { + "epoch": 0.1568, + "grad_norm": 0.69921875, + "learning_rate": 8.500161290322582e-05, + "loss": 0.1757, + "step": 9800 + }, + { + "epoch": 0.156816, + "grad_norm": 0.8828125, + "learning_rate": 8.5e-05, + "loss": 0.1597, + "step": 9801 + }, + { + "epoch": 0.156832, + "grad_norm": 1.0078125, + "learning_rate": 8.49983870967742e-05, + "loss": 0.205, + "step": 9802 + }, + { + "epoch": 0.156848, + "grad_norm": 0.76171875, + "learning_rate": 8.499677419354839e-05, + "loss": 0.1346, + "step": 9803 + }, + { + "epoch": 0.156864, + "grad_norm": 0.98046875, + "learning_rate": 8.499516129032259e-05, + "loss": 0.2072, + "step": 9804 + }, + { + "epoch": 0.15688, + "grad_norm": 0.984375, + "learning_rate": 8.499354838709678e-05, + "loss": 0.1332, + "step": 9805 + }, + { + "epoch": 0.156896, + "grad_norm": 0.71484375, + "learning_rate": 8.499193548387098e-05, + "loss": 0.2004, + "step": 9806 + }, + { + "epoch": 0.156912, + "grad_norm": 0.5625, + "learning_rate": 8.499032258064516e-05, + "loss": 0.1454, + "step": 9807 + }, + { + "epoch": 0.156928, + "grad_norm": 0.88671875, + "learning_rate": 8.498870967741935e-05, + "loss": 0.1731, + "step": 9808 + }, + { + "epoch": 0.156944, + "grad_norm": 0.96484375, + "learning_rate": 8.498709677419355e-05, + "loss": 0.1639, + "step": 9809 + }, + { + "epoch": 0.15696, + "grad_norm": 0.890625, + "learning_rate": 8.498548387096774e-05, + "loss": 0.1486, + "step": 9810 + }, + { + "epoch": 0.156976, + "grad_norm": 1.1640625, + "learning_rate": 8.498387096774193e-05, + "loss": 0.1439, + "step": 9811 + }, + { + "epoch": 0.156992, + "grad_norm": 1.1796875, + "learning_rate": 8.498225806451613e-05, + "loss": 0.1962, + "step": 9812 + }, + { + "epoch": 0.157008, + "grad_norm": 1.0703125, + "learning_rate": 8.498064516129033e-05, + "loss": 0.1723, + "step": 9813 + }, + { + "epoch": 0.157024, + "grad_norm": 1.21875, + "learning_rate": 8.497903225806452e-05, + "loss": 0.1839, + "step": 9814 + }, + { + "epoch": 0.15704, + "grad_norm": 0.6484375, + "learning_rate": 8.497741935483872e-05, + "loss": 0.1562, + "step": 9815 + }, + { + "epoch": 0.157056, + "grad_norm": 1.203125, + "learning_rate": 8.49758064516129e-05, + "loss": 0.2088, + "step": 9816 + }, + { + "epoch": 0.157072, + "grad_norm": 0.59375, + "learning_rate": 8.49741935483871e-05, + "loss": 0.2046, + "step": 9817 + }, + { + "epoch": 0.157088, + "grad_norm": 1.3828125, + "learning_rate": 8.497258064516129e-05, + "loss": 0.1853, + "step": 9818 + }, + { + "epoch": 0.157104, + "grad_norm": 0.6796875, + "learning_rate": 8.497096774193549e-05, + "loss": 0.1346, + "step": 9819 + }, + { + "epoch": 0.15712, + "grad_norm": 0.55859375, + "learning_rate": 8.496935483870968e-05, + "loss": 0.163, + "step": 9820 + }, + { + "epoch": 0.157136, + "grad_norm": 0.8359375, + "learning_rate": 8.496774193548388e-05, + "loss": 0.195, + "step": 9821 + }, + { + "epoch": 0.157152, + "grad_norm": 0.71484375, + "learning_rate": 8.496612903225806e-05, + "loss": 0.1568, + "step": 9822 + }, + { + "epoch": 0.157168, + "grad_norm": 0.9609375, + "learning_rate": 8.496451612903226e-05, + "loss": 0.2109, + "step": 9823 + }, + { + "epoch": 0.157184, + "grad_norm": 0.83203125, + "learning_rate": 8.496290322580646e-05, + "loss": 0.2136, + "step": 9824 + }, + { + "epoch": 0.1572, + "grad_norm": 1.234375, + "learning_rate": 8.496129032258065e-05, + "loss": 0.165, + "step": 9825 + }, + { + "epoch": 0.157216, + "grad_norm": 0.859375, + "learning_rate": 8.495967741935485e-05, + "loss": 0.1321, + "step": 9826 + }, + { + "epoch": 0.157232, + "grad_norm": 0.78125, + "learning_rate": 8.495806451612903e-05, + "loss": 0.2078, + "step": 9827 + }, + { + "epoch": 0.157248, + "grad_norm": 0.609375, + "learning_rate": 8.495645161290323e-05, + "loss": 0.1688, + "step": 9828 + }, + { + "epoch": 0.157264, + "grad_norm": 0.625, + "learning_rate": 8.495483870967742e-05, + "loss": 0.1525, + "step": 9829 + }, + { + "epoch": 0.15728, + "grad_norm": 0.875, + "learning_rate": 8.495322580645162e-05, + "loss": 0.2195, + "step": 9830 + }, + { + "epoch": 0.157296, + "grad_norm": 0.482421875, + "learning_rate": 8.49516129032258e-05, + "loss": 0.1677, + "step": 9831 + }, + { + "epoch": 0.157312, + "grad_norm": 0.52734375, + "learning_rate": 8.495e-05, + "loss": 0.1651, + "step": 9832 + }, + { + "epoch": 0.157328, + "grad_norm": 0.474609375, + "learning_rate": 8.494838709677419e-05, + "loss": 0.143, + "step": 9833 + }, + { + "epoch": 0.157344, + "grad_norm": 0.63671875, + "learning_rate": 8.494677419354839e-05, + "loss": 0.1718, + "step": 9834 + }, + { + "epoch": 0.15736, + "grad_norm": 1.109375, + "learning_rate": 8.494516129032259e-05, + "loss": 0.1832, + "step": 9835 + }, + { + "epoch": 0.157376, + "grad_norm": 0.80859375, + "learning_rate": 8.494354838709678e-05, + "loss": 0.2117, + "step": 9836 + }, + { + "epoch": 0.157392, + "grad_norm": 0.6328125, + "learning_rate": 8.494193548387098e-05, + "loss": 0.1623, + "step": 9837 + }, + { + "epoch": 0.157408, + "grad_norm": 1.046875, + "learning_rate": 8.494032258064518e-05, + "loss": 0.1903, + "step": 9838 + }, + { + "epoch": 0.157424, + "grad_norm": 0.61328125, + "learning_rate": 8.493870967741936e-05, + "loss": 0.1833, + "step": 9839 + }, + { + "epoch": 0.15744, + "grad_norm": 0.8671875, + "learning_rate": 8.493709677419355e-05, + "loss": 0.1907, + "step": 9840 + }, + { + "epoch": 0.157456, + "grad_norm": 0.66015625, + "learning_rate": 8.493548387096775e-05, + "loss": 0.1733, + "step": 9841 + }, + { + "epoch": 0.157472, + "grad_norm": 1.28125, + "learning_rate": 8.493387096774193e-05, + "loss": 0.1851, + "step": 9842 + }, + { + "epoch": 0.157488, + "grad_norm": 0.56640625, + "learning_rate": 8.493225806451613e-05, + "loss": 0.1483, + "step": 9843 + }, + { + "epoch": 0.157504, + "grad_norm": 0.6484375, + "learning_rate": 8.493064516129032e-05, + "loss": 0.1764, + "step": 9844 + }, + { + "epoch": 0.15752, + "grad_norm": 0.7890625, + "learning_rate": 8.492903225806452e-05, + "loss": 0.2074, + "step": 9845 + }, + { + "epoch": 0.157536, + "grad_norm": 0.6171875, + "learning_rate": 8.49274193548387e-05, + "loss": 0.1537, + "step": 9846 + }, + { + "epoch": 0.157552, + "grad_norm": 0.7421875, + "learning_rate": 8.49258064516129e-05, + "loss": 0.1799, + "step": 9847 + }, + { + "epoch": 0.157568, + "grad_norm": 0.76953125, + "learning_rate": 8.49241935483871e-05, + "loss": 0.1857, + "step": 9848 + }, + { + "epoch": 0.157584, + "grad_norm": 0.63671875, + "learning_rate": 8.49225806451613e-05, + "loss": 0.1671, + "step": 9849 + }, + { + "epoch": 0.1576, + "grad_norm": 0.55859375, + "learning_rate": 8.492096774193549e-05, + "loss": 0.1439, + "step": 9850 + }, + { + "epoch": 0.157616, + "grad_norm": 0.9375, + "learning_rate": 8.491935483870969e-05, + "loss": 0.182, + "step": 9851 + }, + { + "epoch": 0.157632, + "grad_norm": 0.69921875, + "learning_rate": 8.491774193548388e-05, + "loss": 0.181, + "step": 9852 + }, + { + "epoch": 0.157648, + "grad_norm": 0.78125, + "learning_rate": 8.491612903225808e-05, + "loss": 0.1832, + "step": 9853 + }, + { + "epoch": 0.157664, + "grad_norm": 0.7890625, + "learning_rate": 8.491451612903226e-05, + "loss": 0.1635, + "step": 9854 + }, + { + "epoch": 0.15768, + "grad_norm": 1.296875, + "learning_rate": 8.491290322580645e-05, + "loss": 0.168, + "step": 9855 + }, + { + "epoch": 0.157696, + "grad_norm": 0.96875, + "learning_rate": 8.491129032258065e-05, + "loss": 0.1574, + "step": 9856 + }, + { + "epoch": 0.157712, + "grad_norm": 0.8984375, + "learning_rate": 8.490967741935483e-05, + "loss": 0.189, + "step": 9857 + }, + { + "epoch": 0.157728, + "grad_norm": 0.7109375, + "learning_rate": 8.490806451612903e-05, + "loss": 0.2058, + "step": 9858 + }, + { + "epoch": 0.157744, + "grad_norm": 0.96484375, + "learning_rate": 8.490645161290323e-05, + "loss": 0.1703, + "step": 9859 + }, + { + "epoch": 0.15776, + "grad_norm": 0.640625, + "learning_rate": 8.490483870967743e-05, + "loss": 0.1857, + "step": 9860 + }, + { + "epoch": 0.157776, + "grad_norm": 0.6796875, + "learning_rate": 8.490322580645162e-05, + "loss": 0.1961, + "step": 9861 + }, + { + "epoch": 0.157792, + "grad_norm": 1.390625, + "learning_rate": 8.490161290322582e-05, + "loss": 0.1957, + "step": 9862 + }, + { + "epoch": 0.157808, + "grad_norm": 0.890625, + "learning_rate": 8.49e-05, + "loss": 0.1903, + "step": 9863 + }, + { + "epoch": 0.157824, + "grad_norm": 0.69921875, + "learning_rate": 8.48983870967742e-05, + "loss": 0.1672, + "step": 9864 + }, + { + "epoch": 0.15784, + "grad_norm": 1.078125, + "learning_rate": 8.489677419354839e-05, + "loss": 0.1703, + "step": 9865 + }, + { + "epoch": 0.157856, + "grad_norm": 0.5625, + "learning_rate": 8.489516129032259e-05, + "loss": 0.1522, + "step": 9866 + }, + { + "epoch": 0.157872, + "grad_norm": 0.828125, + "learning_rate": 8.489354838709678e-05, + "loss": 0.1819, + "step": 9867 + }, + { + "epoch": 0.157888, + "grad_norm": 1.15625, + "learning_rate": 8.489193548387097e-05, + "loss": 0.1624, + "step": 9868 + }, + { + "epoch": 0.157904, + "grad_norm": 0.98828125, + "learning_rate": 8.489032258064516e-05, + "loss": 0.1589, + "step": 9869 + }, + { + "epoch": 0.15792, + "grad_norm": 0.69921875, + "learning_rate": 8.488870967741935e-05, + "loss": 0.1658, + "step": 9870 + }, + { + "epoch": 0.157936, + "grad_norm": 0.61328125, + "learning_rate": 8.488709677419355e-05, + "loss": 0.1868, + "step": 9871 + }, + { + "epoch": 0.157952, + "grad_norm": 0.62109375, + "learning_rate": 8.488548387096775e-05, + "loss": 0.1527, + "step": 9872 + }, + { + "epoch": 0.157968, + "grad_norm": 0.80859375, + "learning_rate": 8.488387096774195e-05, + "loss": 0.1833, + "step": 9873 + }, + { + "epoch": 0.157984, + "grad_norm": 0.9609375, + "learning_rate": 8.488225806451613e-05, + "loss": 0.1641, + "step": 9874 + }, + { + "epoch": 0.158, + "grad_norm": 0.625, + "learning_rate": 8.488064516129033e-05, + "loss": 0.1595, + "step": 9875 + }, + { + "epoch": 0.158016, + "grad_norm": 1.1484375, + "learning_rate": 8.487903225806452e-05, + "loss": 0.1498, + "step": 9876 + }, + { + "epoch": 0.158032, + "grad_norm": 1.3984375, + "learning_rate": 8.487741935483872e-05, + "loss": 0.1999, + "step": 9877 + }, + { + "epoch": 0.158048, + "grad_norm": 0.6015625, + "learning_rate": 8.48758064516129e-05, + "loss": 0.1641, + "step": 9878 + }, + { + "epoch": 0.158064, + "grad_norm": 0.91796875, + "learning_rate": 8.48741935483871e-05, + "loss": 0.1988, + "step": 9879 + }, + { + "epoch": 0.15808, + "grad_norm": 0.74609375, + "learning_rate": 8.487258064516129e-05, + "loss": 0.1327, + "step": 9880 + }, + { + "epoch": 0.158096, + "grad_norm": 0.625, + "learning_rate": 8.487096774193549e-05, + "loss": 0.1702, + "step": 9881 + }, + { + "epoch": 0.158112, + "grad_norm": 0.6640625, + "learning_rate": 8.486935483870967e-05, + "loss": 0.1555, + "step": 9882 + }, + { + "epoch": 0.158128, + "grad_norm": 0.79296875, + "learning_rate": 8.486774193548387e-05, + "loss": 0.2021, + "step": 9883 + }, + { + "epoch": 0.158144, + "grad_norm": 0.9140625, + "learning_rate": 8.486612903225807e-05, + "loss": 0.1853, + "step": 9884 + }, + { + "epoch": 0.15816, + "grad_norm": 0.98046875, + "learning_rate": 8.486451612903227e-05, + "loss": 0.1643, + "step": 9885 + }, + { + "epoch": 0.158176, + "grad_norm": 1.2109375, + "learning_rate": 8.486290322580646e-05, + "loss": 0.2236, + "step": 9886 + }, + { + "epoch": 0.158192, + "grad_norm": 0.80859375, + "learning_rate": 8.486129032258065e-05, + "loss": 0.2042, + "step": 9887 + }, + { + "epoch": 0.158208, + "grad_norm": 0.6953125, + "learning_rate": 8.485967741935485e-05, + "loss": 0.1597, + "step": 9888 + }, + { + "epoch": 0.158224, + "grad_norm": 0.9296875, + "learning_rate": 8.485806451612903e-05, + "loss": 0.171, + "step": 9889 + }, + { + "epoch": 0.15824, + "grad_norm": 0.69140625, + "learning_rate": 8.485645161290323e-05, + "loss": 0.1727, + "step": 9890 + }, + { + "epoch": 0.158256, + "grad_norm": 1.0390625, + "learning_rate": 8.485483870967742e-05, + "loss": 0.1616, + "step": 9891 + }, + { + "epoch": 0.158272, + "grad_norm": 1.1015625, + "learning_rate": 8.485322580645162e-05, + "loss": 0.1868, + "step": 9892 + }, + { + "epoch": 0.158288, + "grad_norm": 0.84375, + "learning_rate": 8.48516129032258e-05, + "loss": 0.1649, + "step": 9893 + }, + { + "epoch": 0.158304, + "grad_norm": 0.84765625, + "learning_rate": 8.485e-05, + "loss": 0.2231, + "step": 9894 + }, + { + "epoch": 0.15832, + "grad_norm": 0.6484375, + "learning_rate": 8.48483870967742e-05, + "loss": 0.1895, + "step": 9895 + }, + { + "epoch": 0.158336, + "grad_norm": 1.078125, + "learning_rate": 8.48467741935484e-05, + "loss": 0.1601, + "step": 9896 + }, + { + "epoch": 0.158352, + "grad_norm": 1.0234375, + "learning_rate": 8.484516129032259e-05, + "loss": 0.1762, + "step": 9897 + }, + { + "epoch": 0.158368, + "grad_norm": 0.8359375, + "learning_rate": 8.484354838709679e-05, + "loss": 0.1834, + "step": 9898 + }, + { + "epoch": 0.158384, + "grad_norm": 0.80078125, + "learning_rate": 8.484193548387097e-05, + "loss": 0.1742, + "step": 9899 + }, + { + "epoch": 0.1584, + "grad_norm": 1.0, + "learning_rate": 8.484032258064517e-05, + "loss": 0.1741, + "step": 9900 + }, + { + "epoch": 0.158416, + "grad_norm": 0.72265625, + "learning_rate": 8.483870967741936e-05, + "loss": 0.1898, + "step": 9901 + }, + { + "epoch": 0.158432, + "grad_norm": 0.84765625, + "learning_rate": 8.483709677419355e-05, + "loss": 0.2124, + "step": 9902 + }, + { + "epoch": 0.158448, + "grad_norm": 1.1328125, + "learning_rate": 8.483548387096774e-05, + "loss": 0.202, + "step": 9903 + }, + { + "epoch": 0.158464, + "grad_norm": 0.609375, + "learning_rate": 8.483387096774193e-05, + "loss": 0.1613, + "step": 9904 + }, + { + "epoch": 0.15848, + "grad_norm": 0.76953125, + "learning_rate": 8.483225806451613e-05, + "loss": 0.185, + "step": 9905 + }, + { + "epoch": 0.158496, + "grad_norm": 0.6171875, + "learning_rate": 8.483064516129032e-05, + "loss": 0.1231, + "step": 9906 + }, + { + "epoch": 0.158512, + "grad_norm": 0.81640625, + "learning_rate": 8.482903225806452e-05, + "loss": 0.1801, + "step": 9907 + }, + { + "epoch": 0.158528, + "grad_norm": 0.74609375, + "learning_rate": 8.482741935483872e-05, + "loss": 0.1788, + "step": 9908 + }, + { + "epoch": 0.158544, + "grad_norm": 0.64453125, + "learning_rate": 8.482580645161292e-05, + "loss": 0.1928, + "step": 9909 + }, + { + "epoch": 0.15856, + "grad_norm": 0.76953125, + "learning_rate": 8.48241935483871e-05, + "loss": 0.235, + "step": 9910 + }, + { + "epoch": 0.158576, + "grad_norm": 0.89453125, + "learning_rate": 8.48225806451613e-05, + "loss": 0.1836, + "step": 9911 + }, + { + "epoch": 0.158592, + "grad_norm": 0.69140625, + "learning_rate": 8.482096774193549e-05, + "loss": 0.1543, + "step": 9912 + }, + { + "epoch": 0.158608, + "grad_norm": 0.8125, + "learning_rate": 8.481935483870969e-05, + "loss": 0.1752, + "step": 9913 + }, + { + "epoch": 0.158624, + "grad_norm": 0.80078125, + "learning_rate": 8.481774193548387e-05, + "loss": 0.1771, + "step": 9914 + }, + { + "epoch": 0.15864, + "grad_norm": 0.70703125, + "learning_rate": 8.481612903225807e-05, + "loss": 0.175, + "step": 9915 + }, + { + "epoch": 0.158656, + "grad_norm": 0.88671875, + "learning_rate": 8.481451612903226e-05, + "loss": 0.1738, + "step": 9916 + }, + { + "epoch": 0.158672, + "grad_norm": 0.66796875, + "learning_rate": 8.481290322580644e-05, + "loss": 0.1891, + "step": 9917 + }, + { + "epoch": 0.158688, + "grad_norm": 0.68359375, + "learning_rate": 8.481129032258064e-05, + "loss": 0.1731, + "step": 9918 + }, + { + "epoch": 0.158704, + "grad_norm": 0.859375, + "learning_rate": 8.480967741935484e-05, + "loss": 0.2014, + "step": 9919 + }, + { + "epoch": 0.15872, + "grad_norm": 0.7890625, + "learning_rate": 8.480806451612904e-05, + "loss": 0.224, + "step": 9920 + }, + { + "epoch": 0.158736, + "grad_norm": 1.0546875, + "learning_rate": 8.480645161290323e-05, + "loss": 0.2052, + "step": 9921 + }, + { + "epoch": 0.158752, + "grad_norm": 0.75390625, + "learning_rate": 8.480483870967743e-05, + "loss": 0.159, + "step": 9922 + }, + { + "epoch": 0.158768, + "grad_norm": 0.640625, + "learning_rate": 8.480322580645162e-05, + "loss": 0.1771, + "step": 9923 + }, + { + "epoch": 0.158784, + "grad_norm": 0.55859375, + "learning_rate": 8.480161290322582e-05, + "loss": 0.1555, + "step": 9924 + }, + { + "epoch": 0.1588, + "grad_norm": 0.63671875, + "learning_rate": 8.48e-05, + "loss": 0.1261, + "step": 9925 + }, + { + "epoch": 0.158816, + "grad_norm": 1.109375, + "learning_rate": 8.47983870967742e-05, + "loss": 0.1776, + "step": 9926 + }, + { + "epoch": 0.158832, + "grad_norm": 1.21875, + "learning_rate": 8.479677419354839e-05, + "loss": 0.1517, + "step": 9927 + }, + { + "epoch": 0.158848, + "grad_norm": 0.984375, + "learning_rate": 8.479516129032259e-05, + "loss": 0.1574, + "step": 9928 + }, + { + "epoch": 0.158864, + "grad_norm": 0.55859375, + "learning_rate": 8.479354838709677e-05, + "loss": 0.1696, + "step": 9929 + }, + { + "epoch": 0.15888, + "grad_norm": 0.70703125, + "learning_rate": 8.479193548387097e-05, + "loss": 0.185, + "step": 9930 + }, + { + "epoch": 0.158896, + "grad_norm": 1.3125, + "learning_rate": 8.479032258064516e-05, + "loss": 0.1834, + "step": 9931 + }, + { + "epoch": 0.158912, + "grad_norm": 0.68359375, + "learning_rate": 8.478870967741936e-05, + "loss": 0.2181, + "step": 9932 + }, + { + "epoch": 0.158928, + "grad_norm": 0.625, + "learning_rate": 8.478709677419356e-05, + "loss": 0.158, + "step": 9933 + }, + { + "epoch": 0.158944, + "grad_norm": 1.0234375, + "learning_rate": 8.478548387096774e-05, + "loss": 0.1811, + "step": 9934 + }, + { + "epoch": 0.15896, + "grad_norm": 0.55078125, + "learning_rate": 8.478387096774194e-05, + "loss": 0.1813, + "step": 9935 + }, + { + "epoch": 0.158976, + "grad_norm": 0.97265625, + "learning_rate": 8.478225806451613e-05, + "loss": 0.1775, + "step": 9936 + }, + { + "epoch": 0.158992, + "grad_norm": 1.25, + "learning_rate": 8.478064516129033e-05, + "loss": 0.1635, + "step": 9937 + }, + { + "epoch": 0.159008, + "grad_norm": 0.98828125, + "learning_rate": 8.477903225806452e-05, + "loss": 0.1425, + "step": 9938 + }, + { + "epoch": 0.159024, + "grad_norm": 0.73828125, + "learning_rate": 8.477741935483871e-05, + "loss": 0.1856, + "step": 9939 + }, + { + "epoch": 0.15904, + "grad_norm": 0.86328125, + "learning_rate": 8.47758064516129e-05, + "loss": 0.1618, + "step": 9940 + }, + { + "epoch": 0.159056, + "grad_norm": 0.61328125, + "learning_rate": 8.47741935483871e-05, + "loss": 0.1781, + "step": 9941 + }, + { + "epoch": 0.159072, + "grad_norm": 1.109375, + "learning_rate": 8.477258064516129e-05, + "loss": 0.2148, + "step": 9942 + }, + { + "epoch": 0.159088, + "grad_norm": 0.98828125, + "learning_rate": 8.477096774193549e-05, + "loss": 0.1825, + "step": 9943 + }, + { + "epoch": 0.159104, + "grad_norm": 0.78515625, + "learning_rate": 8.476935483870969e-05, + "loss": 0.192, + "step": 9944 + }, + { + "epoch": 0.15912, + "grad_norm": 0.60546875, + "learning_rate": 8.476774193548389e-05, + "loss": 0.1646, + "step": 9945 + }, + { + "epoch": 0.159136, + "grad_norm": 0.7734375, + "learning_rate": 8.476612903225807e-05, + "loss": 0.2033, + "step": 9946 + }, + { + "epoch": 0.159152, + "grad_norm": 0.625, + "learning_rate": 8.476451612903227e-05, + "loss": 0.1484, + "step": 9947 + }, + { + "epoch": 0.159168, + "grad_norm": 0.87890625, + "learning_rate": 8.476290322580646e-05, + "loss": 0.1778, + "step": 9948 + }, + { + "epoch": 0.159184, + "grad_norm": 1.125, + "learning_rate": 8.476129032258064e-05, + "loss": 0.1794, + "step": 9949 + }, + { + "epoch": 0.1592, + "grad_norm": 1.3359375, + "learning_rate": 8.475967741935484e-05, + "loss": 0.1562, + "step": 9950 + }, + { + "epoch": 0.159216, + "grad_norm": 0.8359375, + "learning_rate": 8.475806451612903e-05, + "loss": 0.1838, + "step": 9951 + }, + { + "epoch": 0.159232, + "grad_norm": 0.97265625, + "learning_rate": 8.475645161290323e-05, + "loss": 0.2122, + "step": 9952 + }, + { + "epoch": 0.159248, + "grad_norm": 0.84765625, + "learning_rate": 8.475483870967741e-05, + "loss": 0.1727, + "step": 9953 + }, + { + "epoch": 0.159264, + "grad_norm": 1.0625, + "learning_rate": 8.475322580645161e-05, + "loss": 0.1817, + "step": 9954 + }, + { + "epoch": 0.15928, + "grad_norm": 0.67578125, + "learning_rate": 8.475161290322581e-05, + "loss": 0.1519, + "step": 9955 + }, + { + "epoch": 0.159296, + "grad_norm": 0.765625, + "learning_rate": 8.475000000000001e-05, + "loss": 0.1737, + "step": 9956 + }, + { + "epoch": 0.159312, + "grad_norm": 0.921875, + "learning_rate": 8.47483870967742e-05, + "loss": 0.1847, + "step": 9957 + }, + { + "epoch": 0.159328, + "grad_norm": 0.859375, + "learning_rate": 8.47467741935484e-05, + "loss": 0.1862, + "step": 9958 + }, + { + "epoch": 0.159344, + "grad_norm": 0.578125, + "learning_rate": 8.474516129032259e-05, + "loss": 0.1623, + "step": 9959 + }, + { + "epoch": 0.15936, + "grad_norm": 0.55859375, + "learning_rate": 8.474354838709678e-05, + "loss": 0.1328, + "step": 9960 + }, + { + "epoch": 0.159376, + "grad_norm": 0.59765625, + "learning_rate": 8.474193548387097e-05, + "loss": 0.176, + "step": 9961 + }, + { + "epoch": 0.159392, + "grad_norm": 0.6640625, + "learning_rate": 8.474032258064517e-05, + "loss": 0.1835, + "step": 9962 + }, + { + "epoch": 0.159408, + "grad_norm": 1.34375, + "learning_rate": 8.473870967741936e-05, + "loss": 0.1931, + "step": 9963 + }, + { + "epoch": 0.159424, + "grad_norm": 1.09375, + "learning_rate": 8.473709677419354e-05, + "loss": 0.2107, + "step": 9964 + }, + { + "epoch": 0.15944, + "grad_norm": 0.76171875, + "learning_rate": 8.473548387096774e-05, + "loss": 0.1828, + "step": 9965 + }, + { + "epoch": 0.159456, + "grad_norm": 1.5390625, + "learning_rate": 8.473387096774193e-05, + "loss": 0.1877, + "step": 9966 + }, + { + "epoch": 0.159472, + "grad_norm": 0.65234375, + "learning_rate": 8.473225806451613e-05, + "loss": 0.1795, + "step": 9967 + }, + { + "epoch": 0.159488, + "grad_norm": 1.484375, + "learning_rate": 8.473064516129033e-05, + "loss": 0.2221, + "step": 9968 + }, + { + "epoch": 0.159504, + "grad_norm": 1.25, + "learning_rate": 8.472903225806453e-05, + "loss": 0.2102, + "step": 9969 + }, + { + "epoch": 0.15952, + "grad_norm": 0.921875, + "learning_rate": 8.472741935483871e-05, + "loss": 0.2197, + "step": 9970 + }, + { + "epoch": 0.159536, + "grad_norm": 0.94140625, + "learning_rate": 8.472580645161291e-05, + "loss": 0.1891, + "step": 9971 + }, + { + "epoch": 0.159552, + "grad_norm": 1.1796875, + "learning_rate": 8.47241935483871e-05, + "loss": 0.1592, + "step": 9972 + }, + { + "epoch": 0.159568, + "grad_norm": 0.8671875, + "learning_rate": 8.47225806451613e-05, + "loss": 0.1806, + "step": 9973 + }, + { + "epoch": 0.159584, + "grad_norm": 0.65625, + "learning_rate": 8.472096774193548e-05, + "loss": 0.219, + "step": 9974 + }, + { + "epoch": 0.1596, + "grad_norm": 0.7109375, + "learning_rate": 8.471935483870968e-05, + "loss": 0.1412, + "step": 9975 + }, + { + "epoch": 0.159616, + "grad_norm": 0.7421875, + "learning_rate": 8.471774193548387e-05, + "loss": 0.2158, + "step": 9976 + }, + { + "epoch": 0.159632, + "grad_norm": 0.76953125, + "learning_rate": 8.471612903225807e-05, + "loss": 0.2148, + "step": 9977 + }, + { + "epoch": 0.159648, + "grad_norm": 0.734375, + "learning_rate": 8.471451612903226e-05, + "loss": 0.1761, + "step": 9978 + }, + { + "epoch": 0.159664, + "grad_norm": 0.58984375, + "learning_rate": 8.471290322580646e-05, + "loss": 0.1866, + "step": 9979 + }, + { + "epoch": 0.15968, + "grad_norm": 1.0390625, + "learning_rate": 8.471129032258066e-05, + "loss": 0.1909, + "step": 9980 + }, + { + "epoch": 0.159696, + "grad_norm": 0.6640625, + "learning_rate": 8.470967741935484e-05, + "loss": 0.1926, + "step": 9981 + }, + { + "epoch": 0.159712, + "grad_norm": 0.7578125, + "learning_rate": 8.470806451612904e-05, + "loss": 0.1859, + "step": 9982 + }, + { + "epoch": 0.159728, + "grad_norm": 0.90234375, + "learning_rate": 8.470645161290323e-05, + "loss": 0.1779, + "step": 9983 + }, + { + "epoch": 0.159744, + "grad_norm": 0.8203125, + "learning_rate": 8.470483870967743e-05, + "loss": 0.1924, + "step": 9984 + }, + { + "epoch": 0.15976, + "grad_norm": 0.6328125, + "learning_rate": 8.470322580645161e-05, + "loss": 0.1887, + "step": 9985 + }, + { + "epoch": 0.159776, + "grad_norm": 0.61328125, + "learning_rate": 8.470161290322581e-05, + "loss": 0.1576, + "step": 9986 + }, + { + "epoch": 0.159792, + "grad_norm": 0.66796875, + "learning_rate": 8.47e-05, + "loss": 0.1616, + "step": 9987 + }, + { + "epoch": 0.159808, + "grad_norm": 0.87109375, + "learning_rate": 8.46983870967742e-05, + "loss": 0.1961, + "step": 9988 + }, + { + "epoch": 0.159824, + "grad_norm": 0.5078125, + "learning_rate": 8.469677419354838e-05, + "loss": 0.1271, + "step": 9989 + }, + { + "epoch": 0.15984, + "grad_norm": 0.703125, + "learning_rate": 8.469516129032258e-05, + "loss": 0.2011, + "step": 9990 + }, + { + "epoch": 0.159856, + "grad_norm": 1.1015625, + "learning_rate": 8.469354838709678e-05, + "loss": 0.1812, + "step": 9991 + }, + { + "epoch": 0.159872, + "grad_norm": 0.94140625, + "learning_rate": 8.469193548387098e-05, + "loss": 0.1784, + "step": 9992 + }, + { + "epoch": 0.159888, + "grad_norm": 0.69921875, + "learning_rate": 8.469032258064517e-05, + "loss": 0.1955, + "step": 9993 + }, + { + "epoch": 0.159904, + "grad_norm": 0.578125, + "learning_rate": 8.468870967741936e-05, + "loss": 0.1454, + "step": 9994 + }, + { + "epoch": 0.15992, + "grad_norm": 0.6640625, + "learning_rate": 8.468709677419356e-05, + "loss": 0.1489, + "step": 9995 + }, + { + "epoch": 0.159936, + "grad_norm": 0.77734375, + "learning_rate": 8.468548387096774e-05, + "loss": 0.1969, + "step": 9996 + }, + { + "epoch": 0.159952, + "grad_norm": 0.9609375, + "learning_rate": 8.468387096774194e-05, + "loss": 0.18, + "step": 9997 + }, + { + "epoch": 0.159968, + "grad_norm": 0.86328125, + "learning_rate": 8.468225806451613e-05, + "loss": 0.1922, + "step": 9998 + }, + { + "epoch": 0.159984, + "grad_norm": 0.63671875, + "learning_rate": 8.468064516129033e-05, + "loss": 0.1384, + "step": 9999 + }, + { + "epoch": 0.16, + "grad_norm": 0.80078125, + "learning_rate": 8.467903225806451e-05, + "loss": 0.1513, + "step": 10000 + }, + { + "epoch": 0.160016, + "grad_norm": 1.203125, + "learning_rate": 8.467741935483871e-05, + "loss": 0.2192, + "step": 10001 + }, + { + "epoch": 0.160032, + "grad_norm": 1.0625, + "learning_rate": 8.46758064516129e-05, + "loss": 0.1666, + "step": 10002 + }, + { + "epoch": 0.160048, + "grad_norm": 0.80859375, + "learning_rate": 8.46741935483871e-05, + "loss": 0.163, + "step": 10003 + }, + { + "epoch": 0.160064, + "grad_norm": 0.71875, + "learning_rate": 8.46725806451613e-05, + "loss": 0.1745, + "step": 10004 + }, + { + "epoch": 0.16008, + "grad_norm": 1.25, + "learning_rate": 8.46709677419355e-05, + "loss": 0.1664, + "step": 10005 + }, + { + "epoch": 0.160096, + "grad_norm": 0.8125, + "learning_rate": 8.466935483870968e-05, + "loss": 0.1792, + "step": 10006 + }, + { + "epoch": 0.160112, + "grad_norm": 0.62109375, + "learning_rate": 8.466774193548388e-05, + "loss": 0.1854, + "step": 10007 + }, + { + "epoch": 0.160128, + "grad_norm": 1.1796875, + "learning_rate": 8.466612903225807e-05, + "loss": 0.224, + "step": 10008 + }, + { + "epoch": 0.160144, + "grad_norm": 0.875, + "learning_rate": 8.466451612903227e-05, + "loss": 0.1796, + "step": 10009 + }, + { + "epoch": 0.16016, + "grad_norm": 0.75390625, + "learning_rate": 8.466290322580645e-05, + "loss": 0.1587, + "step": 10010 + }, + { + "epoch": 0.160176, + "grad_norm": 1.140625, + "learning_rate": 8.466129032258064e-05, + "loss": 0.1471, + "step": 10011 + }, + { + "epoch": 0.160192, + "grad_norm": 0.73046875, + "learning_rate": 8.465967741935484e-05, + "loss": 0.1549, + "step": 10012 + }, + { + "epoch": 0.160208, + "grad_norm": 0.921875, + "learning_rate": 8.465806451612903e-05, + "loss": 0.15, + "step": 10013 + }, + { + "epoch": 0.160224, + "grad_norm": 0.890625, + "learning_rate": 8.465645161290323e-05, + "loss": 0.1419, + "step": 10014 + }, + { + "epoch": 0.16024, + "grad_norm": 0.6640625, + "learning_rate": 8.465483870967743e-05, + "loss": 0.1734, + "step": 10015 + }, + { + "epoch": 0.160256, + "grad_norm": 1.0234375, + "learning_rate": 8.465322580645163e-05, + "loss": 0.2227, + "step": 10016 + }, + { + "epoch": 0.160272, + "grad_norm": 0.66015625, + "learning_rate": 8.465161290322581e-05, + "loss": 0.168, + "step": 10017 + }, + { + "epoch": 0.160288, + "grad_norm": 0.6640625, + "learning_rate": 8.465000000000001e-05, + "loss": 0.1786, + "step": 10018 + }, + { + "epoch": 0.160304, + "grad_norm": 0.78515625, + "learning_rate": 8.46483870967742e-05, + "loss": 0.1992, + "step": 10019 + }, + { + "epoch": 0.16032, + "grad_norm": 0.80078125, + "learning_rate": 8.46467741935484e-05, + "loss": 0.1346, + "step": 10020 + }, + { + "epoch": 0.160336, + "grad_norm": 0.796875, + "learning_rate": 8.464516129032258e-05, + "loss": 0.1665, + "step": 10021 + }, + { + "epoch": 0.160352, + "grad_norm": 0.59375, + "learning_rate": 8.464354838709678e-05, + "loss": 0.1648, + "step": 10022 + }, + { + "epoch": 0.160368, + "grad_norm": 0.85546875, + "learning_rate": 8.464193548387097e-05, + "loss": 0.1606, + "step": 10023 + }, + { + "epoch": 0.160384, + "grad_norm": 0.76171875, + "learning_rate": 8.464032258064517e-05, + "loss": 0.1671, + "step": 10024 + }, + { + "epoch": 0.1604, + "grad_norm": 0.703125, + "learning_rate": 8.463870967741935e-05, + "loss": 0.1958, + "step": 10025 + }, + { + "epoch": 0.160416, + "grad_norm": 0.55859375, + "learning_rate": 8.463709677419355e-05, + "loss": 0.1497, + "step": 10026 + }, + { + "epoch": 0.160432, + "grad_norm": 0.87109375, + "learning_rate": 8.463548387096774e-05, + "loss": 0.1521, + "step": 10027 + }, + { + "epoch": 0.160448, + "grad_norm": 0.52734375, + "learning_rate": 8.463387096774194e-05, + "loss": 0.1691, + "step": 10028 + }, + { + "epoch": 0.160464, + "grad_norm": 1.640625, + "learning_rate": 8.463225806451614e-05, + "loss": 0.2089, + "step": 10029 + }, + { + "epoch": 0.16048, + "grad_norm": 1.109375, + "learning_rate": 8.463064516129033e-05, + "loss": 0.1751, + "step": 10030 + }, + { + "epoch": 0.160496, + "grad_norm": 0.859375, + "learning_rate": 8.462903225806453e-05, + "loss": 0.176, + "step": 10031 + }, + { + "epoch": 0.160512, + "grad_norm": 0.703125, + "learning_rate": 8.462741935483871e-05, + "loss": 0.1556, + "step": 10032 + }, + { + "epoch": 0.160528, + "grad_norm": 0.80078125, + "learning_rate": 8.462580645161291e-05, + "loss": 0.2183, + "step": 10033 + }, + { + "epoch": 0.160544, + "grad_norm": 0.84765625, + "learning_rate": 8.46241935483871e-05, + "loss": 0.1296, + "step": 10034 + }, + { + "epoch": 0.16056, + "grad_norm": 0.6953125, + "learning_rate": 8.46225806451613e-05, + "loss": 0.1619, + "step": 10035 + }, + { + "epoch": 0.160576, + "grad_norm": 0.546875, + "learning_rate": 8.462096774193548e-05, + "loss": 0.1927, + "step": 10036 + }, + { + "epoch": 0.160592, + "grad_norm": 0.79296875, + "learning_rate": 8.461935483870968e-05, + "loss": 0.1181, + "step": 10037 + }, + { + "epoch": 0.160608, + "grad_norm": 0.64453125, + "learning_rate": 8.461774193548387e-05, + "loss": 0.1768, + "step": 10038 + }, + { + "epoch": 0.160624, + "grad_norm": 0.6796875, + "learning_rate": 8.461612903225807e-05, + "loss": 0.1829, + "step": 10039 + }, + { + "epoch": 0.16064, + "grad_norm": 0.7109375, + "learning_rate": 8.461451612903227e-05, + "loss": 0.1679, + "step": 10040 + }, + { + "epoch": 0.160656, + "grad_norm": 0.68359375, + "learning_rate": 8.461290322580645e-05, + "loss": 0.1607, + "step": 10041 + }, + { + "epoch": 0.160672, + "grad_norm": 1.140625, + "learning_rate": 8.461129032258065e-05, + "loss": 0.1901, + "step": 10042 + }, + { + "epoch": 0.160688, + "grad_norm": 1.3671875, + "learning_rate": 8.460967741935484e-05, + "loss": 0.1857, + "step": 10043 + }, + { + "epoch": 0.160704, + "grad_norm": 0.83203125, + "learning_rate": 8.460806451612904e-05, + "loss": 0.1899, + "step": 10044 + }, + { + "epoch": 0.16072, + "grad_norm": 0.69140625, + "learning_rate": 8.460645161290322e-05, + "loss": 0.183, + "step": 10045 + }, + { + "epoch": 0.160736, + "grad_norm": 1.1171875, + "learning_rate": 8.460483870967742e-05, + "loss": 0.1737, + "step": 10046 + }, + { + "epoch": 0.160752, + "grad_norm": 1.484375, + "learning_rate": 8.460322580645161e-05, + "loss": 0.1793, + "step": 10047 + }, + { + "epoch": 0.160768, + "grad_norm": 0.65625, + "learning_rate": 8.460161290322581e-05, + "loss": 0.2115, + "step": 10048 + }, + { + "epoch": 0.160784, + "grad_norm": 0.84765625, + "learning_rate": 8.46e-05, + "loss": 0.2055, + "step": 10049 + }, + { + "epoch": 0.1608, + "grad_norm": 1.1484375, + "learning_rate": 8.45983870967742e-05, + "loss": 0.2125, + "step": 10050 + }, + { + "epoch": 0.160816, + "grad_norm": 0.86328125, + "learning_rate": 8.45967741935484e-05, + "loss": 0.2065, + "step": 10051 + }, + { + "epoch": 0.160832, + "grad_norm": 1.234375, + "learning_rate": 8.45951612903226e-05, + "loss": 0.154, + "step": 10052 + }, + { + "epoch": 0.160848, + "grad_norm": 0.88671875, + "learning_rate": 8.459354838709678e-05, + "loss": 0.1707, + "step": 10053 + }, + { + "epoch": 0.160864, + "grad_norm": 0.890625, + "learning_rate": 8.459193548387098e-05, + "loss": 0.1624, + "step": 10054 + }, + { + "epoch": 0.16088, + "grad_norm": 0.8515625, + "learning_rate": 8.459032258064517e-05, + "loss": 0.1874, + "step": 10055 + }, + { + "epoch": 0.160896, + "grad_norm": 0.828125, + "learning_rate": 8.458870967741937e-05, + "loss": 0.1506, + "step": 10056 + }, + { + "epoch": 0.160912, + "grad_norm": 0.94921875, + "learning_rate": 8.458709677419355e-05, + "loss": 0.1647, + "step": 10057 + }, + { + "epoch": 0.160928, + "grad_norm": 1.1484375, + "learning_rate": 8.458548387096774e-05, + "loss": 0.1818, + "step": 10058 + }, + { + "epoch": 0.160944, + "grad_norm": 0.89453125, + "learning_rate": 8.458387096774194e-05, + "loss": 0.1777, + "step": 10059 + }, + { + "epoch": 0.16096, + "grad_norm": 0.5390625, + "learning_rate": 8.458225806451612e-05, + "loss": 0.1494, + "step": 10060 + }, + { + "epoch": 0.160976, + "grad_norm": 0.77734375, + "learning_rate": 8.458064516129032e-05, + "loss": 0.1486, + "step": 10061 + }, + { + "epoch": 0.160992, + "grad_norm": 0.5, + "learning_rate": 8.457903225806451e-05, + "loss": 0.1359, + "step": 10062 + }, + { + "epoch": 0.161008, + "grad_norm": 0.62890625, + "learning_rate": 8.457741935483871e-05, + "loss": 0.1714, + "step": 10063 + }, + { + "epoch": 0.161024, + "grad_norm": 0.921875, + "learning_rate": 8.457580645161291e-05, + "loss": 0.1499, + "step": 10064 + }, + { + "epoch": 0.16104, + "grad_norm": 0.69921875, + "learning_rate": 8.457419354838711e-05, + "loss": 0.164, + "step": 10065 + }, + { + "epoch": 0.161056, + "grad_norm": 0.66015625, + "learning_rate": 8.45725806451613e-05, + "loss": 0.1674, + "step": 10066 + }, + { + "epoch": 0.161072, + "grad_norm": 0.7265625, + "learning_rate": 8.45709677419355e-05, + "loss": 0.2042, + "step": 10067 + }, + { + "epoch": 0.161088, + "grad_norm": 0.65234375, + "learning_rate": 8.456935483870968e-05, + "loss": 0.1662, + "step": 10068 + }, + { + "epoch": 0.161104, + "grad_norm": 0.859375, + "learning_rate": 8.456774193548388e-05, + "loss": 0.134, + "step": 10069 + }, + { + "epoch": 0.16112, + "grad_norm": 0.890625, + "learning_rate": 8.456612903225807e-05, + "loss": 0.171, + "step": 10070 + }, + { + "epoch": 0.161136, + "grad_norm": 0.9609375, + "learning_rate": 8.456451612903227e-05, + "loss": 0.1659, + "step": 10071 + }, + { + "epoch": 0.161152, + "grad_norm": 0.7578125, + "learning_rate": 8.456290322580645e-05, + "loss": 0.1536, + "step": 10072 + }, + { + "epoch": 0.161168, + "grad_norm": 1.1015625, + "learning_rate": 8.456129032258064e-05, + "loss": 0.1904, + "step": 10073 + }, + { + "epoch": 0.161184, + "grad_norm": 0.46484375, + "learning_rate": 8.455967741935484e-05, + "loss": 0.1321, + "step": 10074 + }, + { + "epoch": 0.1612, + "grad_norm": 0.765625, + "learning_rate": 8.455806451612904e-05, + "loss": 0.1772, + "step": 10075 + }, + { + "epoch": 0.161216, + "grad_norm": 1.3046875, + "learning_rate": 8.455645161290324e-05, + "loss": 0.2016, + "step": 10076 + }, + { + "epoch": 0.161232, + "grad_norm": 1.1484375, + "learning_rate": 8.455483870967742e-05, + "loss": 0.1577, + "step": 10077 + }, + { + "epoch": 0.161248, + "grad_norm": 1.2265625, + "learning_rate": 8.455322580645162e-05, + "loss": 0.195, + "step": 10078 + }, + { + "epoch": 0.161264, + "grad_norm": 0.73046875, + "learning_rate": 8.455161290322581e-05, + "loss": 0.1289, + "step": 10079 + }, + { + "epoch": 0.16128, + "grad_norm": 0.8203125, + "learning_rate": 8.455000000000001e-05, + "loss": 0.2014, + "step": 10080 + }, + { + "epoch": 0.161296, + "grad_norm": 0.703125, + "learning_rate": 8.45483870967742e-05, + "loss": 0.1599, + "step": 10081 + }, + { + "epoch": 0.161312, + "grad_norm": 0.6328125, + "learning_rate": 8.45467741935484e-05, + "loss": 0.1417, + "step": 10082 + }, + { + "epoch": 0.161328, + "grad_norm": 0.953125, + "learning_rate": 8.454516129032258e-05, + "loss": 0.1499, + "step": 10083 + }, + { + "epoch": 0.161344, + "grad_norm": 0.7421875, + "learning_rate": 8.454354838709678e-05, + "loss": 0.1875, + "step": 10084 + }, + { + "epoch": 0.16136, + "grad_norm": 1.0859375, + "learning_rate": 8.454193548387097e-05, + "loss": 0.2064, + "step": 10085 + }, + { + "epoch": 0.161376, + "grad_norm": 0.70703125, + "learning_rate": 8.454032258064517e-05, + "loss": 0.1932, + "step": 10086 + }, + { + "epoch": 0.161392, + "grad_norm": 0.8359375, + "learning_rate": 8.453870967741937e-05, + "loss": 0.1526, + "step": 10087 + }, + { + "epoch": 0.161408, + "grad_norm": 0.6640625, + "learning_rate": 8.453709677419355e-05, + "loss": 0.1532, + "step": 10088 + }, + { + "epoch": 0.161424, + "grad_norm": 0.76171875, + "learning_rate": 8.453548387096775e-05, + "loss": 0.1597, + "step": 10089 + }, + { + "epoch": 0.16144, + "grad_norm": 0.7578125, + "learning_rate": 8.453387096774194e-05, + "loss": 0.1911, + "step": 10090 + }, + { + "epoch": 0.161456, + "grad_norm": 0.89453125, + "learning_rate": 8.453225806451614e-05, + "loss": 0.1732, + "step": 10091 + }, + { + "epoch": 0.161472, + "grad_norm": 0.75390625, + "learning_rate": 8.453064516129032e-05, + "loss": 0.1528, + "step": 10092 + }, + { + "epoch": 0.161488, + "grad_norm": 0.474609375, + "learning_rate": 8.452903225806452e-05, + "loss": 0.18, + "step": 10093 + }, + { + "epoch": 0.161504, + "grad_norm": 0.75, + "learning_rate": 8.452741935483871e-05, + "loss": 0.1835, + "step": 10094 + }, + { + "epoch": 0.16152, + "grad_norm": 0.765625, + "learning_rate": 8.452580645161291e-05, + "loss": 0.1713, + "step": 10095 + }, + { + "epoch": 0.161536, + "grad_norm": 0.84375, + "learning_rate": 8.45241935483871e-05, + "loss": 0.1692, + "step": 10096 + }, + { + "epoch": 0.161552, + "grad_norm": 0.50390625, + "learning_rate": 8.45225806451613e-05, + "loss": 0.1703, + "step": 10097 + }, + { + "epoch": 0.161568, + "grad_norm": 0.64453125, + "learning_rate": 8.452096774193548e-05, + "loss": 0.1936, + "step": 10098 + }, + { + "epoch": 0.161584, + "grad_norm": 0.859375, + "learning_rate": 8.451935483870968e-05, + "loss": 0.1532, + "step": 10099 + }, + { + "epoch": 0.1616, + "grad_norm": 0.8671875, + "learning_rate": 8.451774193548388e-05, + "loss": 0.1697, + "step": 10100 + }, + { + "epoch": 0.161616, + "grad_norm": 0.5234375, + "learning_rate": 8.451612903225808e-05, + "loss": 0.1527, + "step": 10101 + }, + { + "epoch": 0.161632, + "grad_norm": 0.65234375, + "learning_rate": 8.451451612903227e-05, + "loss": 0.1527, + "step": 10102 + }, + { + "epoch": 0.161648, + "grad_norm": 0.78125, + "learning_rate": 8.451290322580645e-05, + "loss": 0.1595, + "step": 10103 + }, + { + "epoch": 0.161664, + "grad_norm": 0.7109375, + "learning_rate": 8.451129032258065e-05, + "loss": 0.1767, + "step": 10104 + }, + { + "epoch": 0.16168, + "grad_norm": 0.72265625, + "learning_rate": 8.450967741935484e-05, + "loss": 0.1397, + "step": 10105 + }, + { + "epoch": 0.161696, + "grad_norm": 0.84375, + "learning_rate": 8.450806451612904e-05, + "loss": 0.1543, + "step": 10106 + }, + { + "epoch": 0.161712, + "grad_norm": 0.765625, + "learning_rate": 8.450645161290322e-05, + "loss": 0.1937, + "step": 10107 + }, + { + "epoch": 0.161728, + "grad_norm": 1.1640625, + "learning_rate": 8.450483870967742e-05, + "loss": 0.1764, + "step": 10108 + }, + { + "epoch": 0.161744, + "grad_norm": 1.015625, + "learning_rate": 8.450322580645161e-05, + "loss": 0.2017, + "step": 10109 + }, + { + "epoch": 0.16176, + "grad_norm": 0.703125, + "learning_rate": 8.450161290322581e-05, + "loss": 0.1655, + "step": 10110 + }, + { + "epoch": 0.161776, + "grad_norm": 0.80859375, + "learning_rate": 8.450000000000001e-05, + "loss": 0.182, + "step": 10111 + }, + { + "epoch": 0.161792, + "grad_norm": 0.67578125, + "learning_rate": 8.449838709677421e-05, + "loss": 0.1916, + "step": 10112 + }, + { + "epoch": 0.161808, + "grad_norm": 0.75, + "learning_rate": 8.449677419354839e-05, + "loss": 0.1465, + "step": 10113 + }, + { + "epoch": 0.161824, + "grad_norm": 1.1484375, + "learning_rate": 8.449516129032259e-05, + "loss": 0.1734, + "step": 10114 + }, + { + "epoch": 0.16184, + "grad_norm": 1.265625, + "learning_rate": 8.449354838709678e-05, + "loss": 0.1759, + "step": 10115 + }, + { + "epoch": 0.161856, + "grad_norm": 2.046875, + "learning_rate": 8.449193548387098e-05, + "loss": 0.1771, + "step": 10116 + }, + { + "epoch": 0.161872, + "grad_norm": 1.3359375, + "learning_rate": 8.449032258064516e-05, + "loss": 0.1826, + "step": 10117 + }, + { + "epoch": 0.161888, + "grad_norm": 0.625, + "learning_rate": 8.448870967741936e-05, + "loss": 0.1442, + "step": 10118 + }, + { + "epoch": 0.161904, + "grad_norm": 0.84375, + "learning_rate": 8.448709677419355e-05, + "loss": 0.1747, + "step": 10119 + }, + { + "epoch": 0.16192, + "grad_norm": 0.96875, + "learning_rate": 8.448548387096774e-05, + "loss": 0.1598, + "step": 10120 + }, + { + "epoch": 0.161936, + "grad_norm": 0.8828125, + "learning_rate": 8.448387096774194e-05, + "loss": 0.1501, + "step": 10121 + }, + { + "epoch": 0.161952, + "grad_norm": 0.75, + "learning_rate": 8.448225806451612e-05, + "loss": 0.1429, + "step": 10122 + }, + { + "epoch": 0.161968, + "grad_norm": 0.6640625, + "learning_rate": 8.448064516129032e-05, + "loss": 0.1655, + "step": 10123 + }, + { + "epoch": 0.161984, + "grad_norm": 0.88671875, + "learning_rate": 8.447903225806452e-05, + "loss": 0.1579, + "step": 10124 + }, + { + "epoch": 0.162, + "grad_norm": 0.85546875, + "learning_rate": 8.447741935483872e-05, + "loss": 0.151, + "step": 10125 + }, + { + "epoch": 0.162016, + "grad_norm": 0.84765625, + "learning_rate": 8.447580645161291e-05, + "loss": 0.1578, + "step": 10126 + }, + { + "epoch": 0.162032, + "grad_norm": 0.59765625, + "learning_rate": 8.44741935483871e-05, + "loss": 0.1643, + "step": 10127 + }, + { + "epoch": 0.162048, + "grad_norm": 0.828125, + "learning_rate": 8.447258064516129e-05, + "loss": 0.1963, + "step": 10128 + }, + { + "epoch": 0.162064, + "grad_norm": 0.60546875, + "learning_rate": 8.447096774193549e-05, + "loss": 0.1876, + "step": 10129 + }, + { + "epoch": 0.16208, + "grad_norm": 0.8125, + "learning_rate": 8.446935483870968e-05, + "loss": 0.1775, + "step": 10130 + }, + { + "epoch": 0.162096, + "grad_norm": 0.60546875, + "learning_rate": 8.446774193548388e-05, + "loss": 0.1942, + "step": 10131 + }, + { + "epoch": 0.162112, + "grad_norm": 1.1015625, + "learning_rate": 8.446612903225806e-05, + "loss": 0.1854, + "step": 10132 + }, + { + "epoch": 0.162128, + "grad_norm": 0.62890625, + "learning_rate": 8.446451612903226e-05, + "loss": 0.183, + "step": 10133 + }, + { + "epoch": 0.162144, + "grad_norm": 1.0078125, + "learning_rate": 8.446290322580645e-05, + "loss": 0.1744, + "step": 10134 + }, + { + "epoch": 0.16216, + "grad_norm": 1.1875, + "learning_rate": 8.446129032258065e-05, + "loss": 0.1806, + "step": 10135 + }, + { + "epoch": 0.162176, + "grad_norm": 0.890625, + "learning_rate": 8.445967741935485e-05, + "loss": 0.1368, + "step": 10136 + }, + { + "epoch": 0.162192, + "grad_norm": 0.8515625, + "learning_rate": 8.445806451612904e-05, + "loss": 0.1804, + "step": 10137 + }, + { + "epoch": 0.162208, + "grad_norm": 0.91796875, + "learning_rate": 8.445645161290323e-05, + "loss": 0.1556, + "step": 10138 + }, + { + "epoch": 0.162224, + "grad_norm": 1.2109375, + "learning_rate": 8.445483870967742e-05, + "loss": 0.2275, + "step": 10139 + }, + { + "epoch": 0.16224, + "grad_norm": 0.671875, + "learning_rate": 8.445322580645162e-05, + "loss": 0.1994, + "step": 10140 + }, + { + "epoch": 0.162256, + "grad_norm": 1.1484375, + "learning_rate": 8.44516129032258e-05, + "loss": 0.1819, + "step": 10141 + }, + { + "epoch": 0.162272, + "grad_norm": 0.6015625, + "learning_rate": 8.445e-05, + "loss": 0.1631, + "step": 10142 + }, + { + "epoch": 0.162288, + "grad_norm": 0.8359375, + "learning_rate": 8.444838709677419e-05, + "loss": 0.1804, + "step": 10143 + }, + { + "epoch": 0.162304, + "grad_norm": 1.1015625, + "learning_rate": 8.444677419354839e-05, + "loss": 0.1413, + "step": 10144 + }, + { + "epoch": 0.16232, + "grad_norm": 0.66015625, + "learning_rate": 8.444516129032258e-05, + "loss": 0.1744, + "step": 10145 + }, + { + "epoch": 0.162336, + "grad_norm": 1.0234375, + "learning_rate": 8.444354838709678e-05, + "loss": 0.1668, + "step": 10146 + }, + { + "epoch": 0.162352, + "grad_norm": 0.91796875, + "learning_rate": 8.444193548387098e-05, + "loss": 0.1537, + "step": 10147 + }, + { + "epoch": 0.162368, + "grad_norm": 1.03125, + "learning_rate": 8.444032258064518e-05, + "loss": 0.1829, + "step": 10148 + }, + { + "epoch": 0.162384, + "grad_norm": 0.64453125, + "learning_rate": 8.443870967741936e-05, + "loss": 0.1815, + "step": 10149 + }, + { + "epoch": 0.1624, + "grad_norm": 0.578125, + "learning_rate": 8.443709677419355e-05, + "loss": 0.1417, + "step": 10150 + }, + { + "epoch": 0.162416, + "grad_norm": 1.0546875, + "learning_rate": 8.443548387096775e-05, + "loss": 0.1884, + "step": 10151 + }, + { + "epoch": 0.162432, + "grad_norm": 0.8671875, + "learning_rate": 8.443387096774193e-05, + "loss": 0.1768, + "step": 10152 + }, + { + "epoch": 0.162448, + "grad_norm": 0.75390625, + "learning_rate": 8.443225806451613e-05, + "loss": 0.2014, + "step": 10153 + }, + { + "epoch": 0.162464, + "grad_norm": 0.859375, + "learning_rate": 8.443064516129032e-05, + "loss": 0.1633, + "step": 10154 + }, + { + "epoch": 0.16248, + "grad_norm": 0.703125, + "learning_rate": 8.442903225806452e-05, + "loss": 0.1764, + "step": 10155 + }, + { + "epoch": 0.162496, + "grad_norm": 1.2578125, + "learning_rate": 8.44274193548387e-05, + "loss": 0.1967, + "step": 10156 + }, + { + "epoch": 0.162512, + "grad_norm": 0.72265625, + "learning_rate": 8.44258064516129e-05, + "loss": 0.1696, + "step": 10157 + }, + { + "epoch": 0.162528, + "grad_norm": 1.203125, + "learning_rate": 8.442419354838709e-05, + "loss": 0.1765, + "step": 10158 + }, + { + "epoch": 0.162544, + "grad_norm": 1.359375, + "learning_rate": 8.442258064516129e-05, + "loss": 0.1848, + "step": 10159 + }, + { + "epoch": 0.16256, + "grad_norm": 1.390625, + "learning_rate": 8.442096774193549e-05, + "loss": 0.1661, + "step": 10160 + }, + { + "epoch": 0.162576, + "grad_norm": 0.5703125, + "learning_rate": 8.441935483870969e-05, + "loss": 0.1486, + "step": 10161 + }, + { + "epoch": 0.162592, + "grad_norm": 0.63671875, + "learning_rate": 8.441774193548388e-05, + "loss": 0.1395, + "step": 10162 + }, + { + "epoch": 0.162608, + "grad_norm": 0.7265625, + "learning_rate": 8.441612903225808e-05, + "loss": 0.1802, + "step": 10163 + }, + { + "epoch": 0.162624, + "grad_norm": 0.72265625, + "learning_rate": 8.441451612903226e-05, + "loss": 0.1753, + "step": 10164 + }, + { + "epoch": 0.16264, + "grad_norm": 1.0546875, + "learning_rate": 8.441290322580646e-05, + "loss": 0.1686, + "step": 10165 + }, + { + "epoch": 0.162656, + "grad_norm": 1.46875, + "learning_rate": 8.441129032258065e-05, + "loss": 0.1958, + "step": 10166 + }, + { + "epoch": 0.162672, + "grad_norm": 0.66796875, + "learning_rate": 8.440967741935483e-05, + "loss": 0.1903, + "step": 10167 + }, + { + "epoch": 0.162688, + "grad_norm": 1.0546875, + "learning_rate": 8.440806451612903e-05, + "loss": 0.1802, + "step": 10168 + }, + { + "epoch": 0.162704, + "grad_norm": 1.15625, + "learning_rate": 8.440645161290322e-05, + "loss": 0.191, + "step": 10169 + }, + { + "epoch": 0.16272, + "grad_norm": 0.87890625, + "learning_rate": 8.440483870967742e-05, + "loss": 0.1558, + "step": 10170 + }, + { + "epoch": 0.162736, + "grad_norm": 0.9140625, + "learning_rate": 8.440322580645162e-05, + "loss": 0.18, + "step": 10171 + }, + { + "epoch": 0.162752, + "grad_norm": 1.2421875, + "learning_rate": 8.440161290322582e-05, + "loss": 0.1982, + "step": 10172 + }, + { + "epoch": 0.162768, + "grad_norm": 0.67578125, + "learning_rate": 8.44e-05, + "loss": 0.1273, + "step": 10173 + }, + { + "epoch": 0.162784, + "grad_norm": 1.015625, + "learning_rate": 8.43983870967742e-05, + "loss": 0.2195, + "step": 10174 + }, + { + "epoch": 0.1628, + "grad_norm": 0.54296875, + "learning_rate": 8.439677419354839e-05, + "loss": 0.145, + "step": 10175 + }, + { + "epoch": 0.162816, + "grad_norm": 1.078125, + "learning_rate": 8.439516129032259e-05, + "loss": 0.2055, + "step": 10176 + }, + { + "epoch": 0.162832, + "grad_norm": 0.58984375, + "learning_rate": 8.439354838709678e-05, + "loss": 0.1698, + "step": 10177 + }, + { + "epoch": 0.162848, + "grad_norm": 0.84375, + "learning_rate": 8.439193548387098e-05, + "loss": 0.1714, + "step": 10178 + }, + { + "epoch": 0.162864, + "grad_norm": 1.515625, + "learning_rate": 8.439032258064516e-05, + "loss": 0.1661, + "step": 10179 + }, + { + "epoch": 0.16288, + "grad_norm": 0.80859375, + "learning_rate": 8.438870967741936e-05, + "loss": 0.1824, + "step": 10180 + }, + { + "epoch": 0.162896, + "grad_norm": 1.1640625, + "learning_rate": 8.438709677419355e-05, + "loss": 0.202, + "step": 10181 + }, + { + "epoch": 0.162912, + "grad_norm": 0.88671875, + "learning_rate": 8.438548387096775e-05, + "loss": 0.179, + "step": 10182 + }, + { + "epoch": 0.162928, + "grad_norm": 0.7109375, + "learning_rate": 8.438387096774193e-05, + "loss": 0.1602, + "step": 10183 + }, + { + "epoch": 0.162944, + "grad_norm": 0.6015625, + "learning_rate": 8.438225806451613e-05, + "loss": 0.2001, + "step": 10184 + }, + { + "epoch": 0.16296, + "grad_norm": 1.296875, + "learning_rate": 8.438064516129033e-05, + "loss": 0.1438, + "step": 10185 + }, + { + "epoch": 0.162976, + "grad_norm": 0.6953125, + "learning_rate": 8.437903225806452e-05, + "loss": 0.1438, + "step": 10186 + }, + { + "epoch": 0.162992, + "grad_norm": 0.73046875, + "learning_rate": 8.437741935483872e-05, + "loss": 0.1651, + "step": 10187 + }, + { + "epoch": 0.163008, + "grad_norm": 0.875, + "learning_rate": 8.43758064516129e-05, + "loss": 0.1449, + "step": 10188 + }, + { + "epoch": 0.163024, + "grad_norm": 0.6484375, + "learning_rate": 8.43741935483871e-05, + "loss": 0.1365, + "step": 10189 + }, + { + "epoch": 0.16304, + "grad_norm": 1.6953125, + "learning_rate": 8.437258064516129e-05, + "loss": 0.222, + "step": 10190 + }, + { + "epoch": 0.163056, + "grad_norm": 0.875, + "learning_rate": 8.437096774193549e-05, + "loss": 0.1644, + "step": 10191 + }, + { + "epoch": 0.163072, + "grad_norm": 0.66015625, + "learning_rate": 8.436935483870968e-05, + "loss": 0.1729, + "step": 10192 + }, + { + "epoch": 0.163088, + "grad_norm": 1.046875, + "learning_rate": 8.436774193548388e-05, + "loss": 0.1561, + "step": 10193 + }, + { + "epoch": 0.163104, + "grad_norm": 0.6171875, + "learning_rate": 8.436612903225806e-05, + "loss": 0.1448, + "step": 10194 + }, + { + "epoch": 0.16312, + "grad_norm": 0.65234375, + "learning_rate": 8.436451612903226e-05, + "loss": 0.1918, + "step": 10195 + }, + { + "epoch": 0.163136, + "grad_norm": 1.2734375, + "learning_rate": 8.436290322580646e-05, + "loss": 0.2224, + "step": 10196 + }, + { + "epoch": 0.163152, + "grad_norm": 0.625, + "learning_rate": 8.436129032258065e-05, + "loss": 0.1679, + "step": 10197 + }, + { + "epoch": 0.163168, + "grad_norm": 0.546875, + "learning_rate": 8.435967741935485e-05, + "loss": 0.1771, + "step": 10198 + }, + { + "epoch": 0.163184, + "grad_norm": 0.671875, + "learning_rate": 8.435806451612903e-05, + "loss": 0.169, + "step": 10199 + }, + { + "epoch": 0.1632, + "grad_norm": 0.953125, + "learning_rate": 8.435645161290323e-05, + "loss": 0.1656, + "step": 10200 + }, + { + "epoch": 0.163216, + "grad_norm": 0.74609375, + "learning_rate": 8.435483870967742e-05, + "loss": 0.1557, + "step": 10201 + }, + { + "epoch": 0.163232, + "grad_norm": 0.69140625, + "learning_rate": 8.435322580645162e-05, + "loss": 0.1831, + "step": 10202 + }, + { + "epoch": 0.163248, + "grad_norm": 0.671875, + "learning_rate": 8.43516129032258e-05, + "loss": 0.1963, + "step": 10203 + }, + { + "epoch": 0.163264, + "grad_norm": 0.79296875, + "learning_rate": 8.435e-05, + "loss": 0.1927, + "step": 10204 + }, + { + "epoch": 0.16328, + "grad_norm": 0.91796875, + "learning_rate": 8.434838709677419e-05, + "loss": 0.223, + "step": 10205 + }, + { + "epoch": 0.163296, + "grad_norm": 0.75, + "learning_rate": 8.434677419354839e-05, + "loss": 0.1966, + "step": 10206 + }, + { + "epoch": 0.163312, + "grad_norm": 0.72265625, + "learning_rate": 8.434516129032259e-05, + "loss": 0.167, + "step": 10207 + }, + { + "epoch": 0.163328, + "grad_norm": 0.69921875, + "learning_rate": 8.434354838709679e-05, + "loss": 0.1664, + "step": 10208 + }, + { + "epoch": 0.163344, + "grad_norm": 0.81640625, + "learning_rate": 8.434193548387097e-05, + "loss": 0.1589, + "step": 10209 + }, + { + "epoch": 0.16336, + "grad_norm": 0.67578125, + "learning_rate": 8.434032258064517e-05, + "loss": 0.1901, + "step": 10210 + }, + { + "epoch": 0.163376, + "grad_norm": 0.859375, + "learning_rate": 8.433870967741936e-05, + "loss": 0.155, + "step": 10211 + }, + { + "epoch": 0.163392, + "grad_norm": 0.8359375, + "learning_rate": 8.433709677419355e-05, + "loss": 0.2178, + "step": 10212 + }, + { + "epoch": 0.163408, + "grad_norm": 1.359375, + "learning_rate": 8.433548387096775e-05, + "loss": 0.1984, + "step": 10213 + }, + { + "epoch": 0.163424, + "grad_norm": 0.7265625, + "learning_rate": 8.433387096774193e-05, + "loss": 0.167, + "step": 10214 + }, + { + "epoch": 0.16344, + "grad_norm": 0.71484375, + "learning_rate": 8.433225806451613e-05, + "loss": 0.1796, + "step": 10215 + }, + { + "epoch": 0.163456, + "grad_norm": 0.98046875, + "learning_rate": 8.433064516129032e-05, + "loss": 0.2058, + "step": 10216 + }, + { + "epoch": 0.163472, + "grad_norm": 1.046875, + "learning_rate": 8.432903225806452e-05, + "loss": 0.1945, + "step": 10217 + }, + { + "epoch": 0.163488, + "grad_norm": 1.90625, + "learning_rate": 8.43274193548387e-05, + "loss": 0.1981, + "step": 10218 + }, + { + "epoch": 0.163504, + "grad_norm": 1.2421875, + "learning_rate": 8.43258064516129e-05, + "loss": 0.1985, + "step": 10219 + }, + { + "epoch": 0.16352, + "grad_norm": 1.09375, + "learning_rate": 8.43241935483871e-05, + "loss": 0.1574, + "step": 10220 + }, + { + "epoch": 0.163536, + "grad_norm": 0.82421875, + "learning_rate": 8.43225806451613e-05, + "loss": 0.1565, + "step": 10221 + }, + { + "epoch": 0.163552, + "grad_norm": 0.703125, + "learning_rate": 8.432096774193549e-05, + "loss": 0.155, + "step": 10222 + }, + { + "epoch": 0.163568, + "grad_norm": 0.59765625, + "learning_rate": 8.431935483870969e-05, + "loss": 0.1854, + "step": 10223 + }, + { + "epoch": 0.163584, + "grad_norm": 0.85546875, + "learning_rate": 8.431774193548387e-05, + "loss": 0.156, + "step": 10224 + }, + { + "epoch": 0.1636, + "grad_norm": 0.75, + "learning_rate": 8.431612903225807e-05, + "loss": 0.177, + "step": 10225 + }, + { + "epoch": 0.163616, + "grad_norm": 0.87890625, + "learning_rate": 8.431451612903226e-05, + "loss": 0.1848, + "step": 10226 + }, + { + "epoch": 0.163632, + "grad_norm": 0.85546875, + "learning_rate": 8.431290322580646e-05, + "loss": 0.1269, + "step": 10227 + }, + { + "epoch": 0.163648, + "grad_norm": 0.5625, + "learning_rate": 8.431129032258065e-05, + "loss": 0.1797, + "step": 10228 + }, + { + "epoch": 0.163664, + "grad_norm": 1.0859375, + "learning_rate": 8.430967741935483e-05, + "loss": 0.1796, + "step": 10229 + }, + { + "epoch": 0.16368, + "grad_norm": 0.73046875, + "learning_rate": 8.430806451612903e-05, + "loss": 0.1789, + "step": 10230 + }, + { + "epoch": 0.163696, + "grad_norm": 0.953125, + "learning_rate": 8.430645161290323e-05, + "loss": 0.1559, + "step": 10231 + }, + { + "epoch": 0.163712, + "grad_norm": 0.87109375, + "learning_rate": 8.430483870967743e-05, + "loss": 0.204, + "step": 10232 + }, + { + "epoch": 0.163728, + "grad_norm": 0.75390625, + "learning_rate": 8.430322580645162e-05, + "loss": 0.1949, + "step": 10233 + }, + { + "epoch": 0.163744, + "grad_norm": 0.61328125, + "learning_rate": 8.430161290322582e-05, + "loss": 0.2028, + "step": 10234 + }, + { + "epoch": 0.16376, + "grad_norm": 0.69921875, + "learning_rate": 8.43e-05, + "loss": 0.1869, + "step": 10235 + }, + { + "epoch": 0.163776, + "grad_norm": 0.5859375, + "learning_rate": 8.42983870967742e-05, + "loss": 0.1779, + "step": 10236 + }, + { + "epoch": 0.163792, + "grad_norm": 0.859375, + "learning_rate": 8.429677419354839e-05, + "loss": 0.1647, + "step": 10237 + }, + { + "epoch": 0.163808, + "grad_norm": 0.80859375, + "learning_rate": 8.429516129032259e-05, + "loss": 0.1699, + "step": 10238 + }, + { + "epoch": 0.163824, + "grad_norm": 0.62109375, + "learning_rate": 8.429354838709677e-05, + "loss": 0.1412, + "step": 10239 + }, + { + "epoch": 0.16384, + "grad_norm": 0.64453125, + "learning_rate": 8.429193548387097e-05, + "loss": 0.1529, + "step": 10240 + }, + { + "epoch": 0.163856, + "grad_norm": 1.6640625, + "learning_rate": 8.429032258064516e-05, + "loss": 0.2218, + "step": 10241 + }, + { + "epoch": 0.163872, + "grad_norm": 0.78125, + "learning_rate": 8.428870967741936e-05, + "loss": 0.1452, + "step": 10242 + }, + { + "epoch": 0.163888, + "grad_norm": 0.6328125, + "learning_rate": 8.428709677419356e-05, + "loss": 0.1153, + "step": 10243 + }, + { + "epoch": 0.163904, + "grad_norm": 0.68359375, + "learning_rate": 8.428548387096775e-05, + "loss": 0.1749, + "step": 10244 + }, + { + "epoch": 0.16392, + "grad_norm": 1.0625, + "learning_rate": 8.428387096774194e-05, + "loss": 0.1845, + "step": 10245 + }, + { + "epoch": 0.163936, + "grad_norm": 0.6640625, + "learning_rate": 8.428225806451613e-05, + "loss": 0.1443, + "step": 10246 + }, + { + "epoch": 0.163952, + "grad_norm": 0.96875, + "learning_rate": 8.428064516129033e-05, + "loss": 0.1848, + "step": 10247 + }, + { + "epoch": 0.163968, + "grad_norm": 0.73828125, + "learning_rate": 8.427903225806452e-05, + "loss": 0.1379, + "step": 10248 + }, + { + "epoch": 0.163984, + "grad_norm": 1.1171875, + "learning_rate": 8.427741935483872e-05, + "loss": 0.1768, + "step": 10249 + }, + { + "epoch": 0.164, + "grad_norm": 0.87109375, + "learning_rate": 8.42758064516129e-05, + "loss": 0.1748, + "step": 10250 + }, + { + "epoch": 0.164016, + "grad_norm": 0.62890625, + "learning_rate": 8.42741935483871e-05, + "loss": 0.1655, + "step": 10251 + }, + { + "epoch": 0.164032, + "grad_norm": 0.890625, + "learning_rate": 8.427258064516129e-05, + "loss": 0.2116, + "step": 10252 + }, + { + "epoch": 0.164048, + "grad_norm": 0.8359375, + "learning_rate": 8.427096774193549e-05, + "loss": 0.1682, + "step": 10253 + }, + { + "epoch": 0.164064, + "grad_norm": 0.80078125, + "learning_rate": 8.426935483870967e-05, + "loss": 0.1591, + "step": 10254 + }, + { + "epoch": 0.16408, + "grad_norm": 0.65234375, + "learning_rate": 8.426774193548387e-05, + "loss": 0.1582, + "step": 10255 + }, + { + "epoch": 0.164096, + "grad_norm": 0.53125, + "learning_rate": 8.426612903225807e-05, + "loss": 0.1573, + "step": 10256 + }, + { + "epoch": 0.164112, + "grad_norm": 0.73828125, + "learning_rate": 8.426451612903227e-05, + "loss": 0.1492, + "step": 10257 + }, + { + "epoch": 0.164128, + "grad_norm": 0.61328125, + "learning_rate": 8.426290322580646e-05, + "loss": 0.1717, + "step": 10258 + }, + { + "epoch": 0.164144, + "grad_norm": 0.50390625, + "learning_rate": 8.426129032258064e-05, + "loss": 0.1448, + "step": 10259 + }, + { + "epoch": 0.16416, + "grad_norm": 0.6484375, + "learning_rate": 8.425967741935484e-05, + "loss": 0.175, + "step": 10260 + }, + { + "epoch": 0.164176, + "grad_norm": 0.98828125, + "learning_rate": 8.425806451612903e-05, + "loss": 0.1602, + "step": 10261 + }, + { + "epoch": 0.164192, + "grad_norm": 0.64453125, + "learning_rate": 8.425645161290323e-05, + "loss": 0.1666, + "step": 10262 + }, + { + "epoch": 0.164208, + "grad_norm": 0.95703125, + "learning_rate": 8.425483870967742e-05, + "loss": 0.1557, + "step": 10263 + }, + { + "epoch": 0.164224, + "grad_norm": 0.77734375, + "learning_rate": 8.425322580645162e-05, + "loss": 0.1413, + "step": 10264 + }, + { + "epoch": 0.16424, + "grad_norm": 0.984375, + "learning_rate": 8.42516129032258e-05, + "loss": 0.1962, + "step": 10265 + }, + { + "epoch": 0.164256, + "grad_norm": 0.63671875, + "learning_rate": 8.425e-05, + "loss": 0.1926, + "step": 10266 + }, + { + "epoch": 0.164272, + "grad_norm": 0.69921875, + "learning_rate": 8.42483870967742e-05, + "loss": 0.1802, + "step": 10267 + }, + { + "epoch": 0.164288, + "grad_norm": 0.8984375, + "learning_rate": 8.42467741935484e-05, + "loss": 0.178, + "step": 10268 + }, + { + "epoch": 0.164304, + "grad_norm": 0.75390625, + "learning_rate": 8.424516129032259e-05, + "loss": 0.1802, + "step": 10269 + }, + { + "epoch": 0.16432, + "grad_norm": 0.68359375, + "learning_rate": 8.424354838709679e-05, + "loss": 0.2014, + "step": 10270 + }, + { + "epoch": 0.164336, + "grad_norm": 0.73046875, + "learning_rate": 8.424193548387097e-05, + "loss": 0.2218, + "step": 10271 + }, + { + "epoch": 0.164352, + "grad_norm": 1.203125, + "learning_rate": 8.424032258064517e-05, + "loss": 0.1948, + "step": 10272 + }, + { + "epoch": 0.164368, + "grad_norm": 0.87890625, + "learning_rate": 8.423870967741936e-05, + "loss": 0.1719, + "step": 10273 + }, + { + "epoch": 0.164384, + "grad_norm": 0.76953125, + "learning_rate": 8.423709677419356e-05, + "loss": 0.1755, + "step": 10274 + }, + { + "epoch": 0.1644, + "grad_norm": 0.62890625, + "learning_rate": 8.423548387096774e-05, + "loss": 0.1359, + "step": 10275 + }, + { + "epoch": 0.164416, + "grad_norm": 0.69921875, + "learning_rate": 8.423387096774193e-05, + "loss": 0.143, + "step": 10276 + }, + { + "epoch": 0.164432, + "grad_norm": 0.58203125, + "learning_rate": 8.423225806451613e-05, + "loss": 0.1464, + "step": 10277 + }, + { + "epoch": 0.164448, + "grad_norm": 1.0390625, + "learning_rate": 8.423064516129033e-05, + "loss": 0.185, + "step": 10278 + }, + { + "epoch": 0.164464, + "grad_norm": 0.8359375, + "learning_rate": 8.422903225806452e-05, + "loss": 0.1876, + "step": 10279 + }, + { + "epoch": 0.16448, + "grad_norm": 0.98828125, + "learning_rate": 8.422741935483871e-05, + "loss": 0.1526, + "step": 10280 + }, + { + "epoch": 0.164496, + "grad_norm": 0.5546875, + "learning_rate": 8.422580645161291e-05, + "loss": 0.1723, + "step": 10281 + }, + { + "epoch": 0.164512, + "grad_norm": 0.953125, + "learning_rate": 8.42241935483871e-05, + "loss": 0.162, + "step": 10282 + }, + { + "epoch": 0.164528, + "grad_norm": 0.7578125, + "learning_rate": 8.42225806451613e-05, + "loss": 0.1549, + "step": 10283 + }, + { + "epoch": 0.164544, + "grad_norm": 0.67578125, + "learning_rate": 8.422096774193549e-05, + "loss": 0.1475, + "step": 10284 + }, + { + "epoch": 0.16456, + "grad_norm": 0.90234375, + "learning_rate": 8.421935483870969e-05, + "loss": 0.1381, + "step": 10285 + }, + { + "epoch": 0.164576, + "grad_norm": 0.5703125, + "learning_rate": 8.421774193548387e-05, + "loss": 0.1983, + "step": 10286 + }, + { + "epoch": 0.164592, + "grad_norm": 0.609375, + "learning_rate": 8.421612903225807e-05, + "loss": 0.1898, + "step": 10287 + }, + { + "epoch": 0.164608, + "grad_norm": 0.99609375, + "learning_rate": 8.421451612903226e-05, + "loss": 0.1476, + "step": 10288 + }, + { + "epoch": 0.164624, + "grad_norm": 0.77734375, + "learning_rate": 8.421290322580646e-05, + "loss": 0.1574, + "step": 10289 + }, + { + "epoch": 0.16464, + "grad_norm": 0.77734375, + "learning_rate": 8.421129032258064e-05, + "loss": 0.1561, + "step": 10290 + }, + { + "epoch": 0.164656, + "grad_norm": 1.046875, + "learning_rate": 8.420967741935484e-05, + "loss": 0.1735, + "step": 10291 + }, + { + "epoch": 0.164672, + "grad_norm": 0.9609375, + "learning_rate": 8.420806451612904e-05, + "loss": 0.2002, + "step": 10292 + }, + { + "epoch": 0.164688, + "grad_norm": 0.88671875, + "learning_rate": 8.420645161290323e-05, + "loss": 0.193, + "step": 10293 + }, + { + "epoch": 0.164704, + "grad_norm": 0.77734375, + "learning_rate": 8.420483870967743e-05, + "loss": 0.1849, + "step": 10294 + }, + { + "epoch": 0.16472, + "grad_norm": 0.66796875, + "learning_rate": 8.420322580645161e-05, + "loss": 0.2058, + "step": 10295 + }, + { + "epoch": 0.164736, + "grad_norm": 0.9765625, + "learning_rate": 8.420161290322581e-05, + "loss": 0.2284, + "step": 10296 + }, + { + "epoch": 0.164752, + "grad_norm": 0.640625, + "learning_rate": 8.42e-05, + "loss": 0.1537, + "step": 10297 + }, + { + "epoch": 0.164768, + "grad_norm": 0.6171875, + "learning_rate": 8.41983870967742e-05, + "loss": 0.1174, + "step": 10298 + }, + { + "epoch": 0.164784, + "grad_norm": 0.66015625, + "learning_rate": 8.419677419354839e-05, + "loss": 0.1688, + "step": 10299 + }, + { + "epoch": 0.1648, + "grad_norm": 0.56640625, + "learning_rate": 8.419516129032259e-05, + "loss": 0.1495, + "step": 10300 + }, + { + "epoch": 0.164816, + "grad_norm": 0.8359375, + "learning_rate": 8.419354838709677e-05, + "loss": 0.1584, + "step": 10301 + }, + { + "epoch": 0.164832, + "grad_norm": 0.76171875, + "learning_rate": 8.419193548387097e-05, + "loss": 0.1767, + "step": 10302 + }, + { + "epoch": 0.164848, + "grad_norm": 0.70703125, + "learning_rate": 8.419032258064517e-05, + "loss": 0.2113, + "step": 10303 + }, + { + "epoch": 0.164864, + "grad_norm": 1.1484375, + "learning_rate": 8.418870967741937e-05, + "loss": 0.2141, + "step": 10304 + }, + { + "epoch": 0.16488, + "grad_norm": 1.1015625, + "learning_rate": 8.418709677419356e-05, + "loss": 0.204, + "step": 10305 + }, + { + "epoch": 0.164896, + "grad_norm": 1.3984375, + "learning_rate": 8.418548387096774e-05, + "loss": 0.1646, + "step": 10306 + }, + { + "epoch": 0.164912, + "grad_norm": 0.6484375, + "learning_rate": 8.418387096774194e-05, + "loss": 0.1911, + "step": 10307 + }, + { + "epoch": 0.164928, + "grad_norm": 0.7109375, + "learning_rate": 8.418225806451613e-05, + "loss": 0.1773, + "step": 10308 + }, + { + "epoch": 0.164944, + "grad_norm": 0.9140625, + "learning_rate": 8.418064516129033e-05, + "loss": 0.1238, + "step": 10309 + }, + { + "epoch": 0.16496, + "grad_norm": 1.34375, + "learning_rate": 8.417903225806451e-05, + "loss": 0.1783, + "step": 10310 + }, + { + "epoch": 0.164976, + "grad_norm": 0.84765625, + "learning_rate": 8.417741935483871e-05, + "loss": 0.1682, + "step": 10311 + }, + { + "epoch": 0.164992, + "grad_norm": 0.73046875, + "learning_rate": 8.41758064516129e-05, + "loss": 0.1522, + "step": 10312 + }, + { + "epoch": 0.165008, + "grad_norm": 0.79296875, + "learning_rate": 8.41741935483871e-05, + "loss": 0.1896, + "step": 10313 + }, + { + "epoch": 0.165024, + "grad_norm": 0.77734375, + "learning_rate": 8.417258064516129e-05, + "loss": 0.2244, + "step": 10314 + }, + { + "epoch": 0.16504, + "grad_norm": 0.74609375, + "learning_rate": 8.417096774193549e-05, + "loss": 0.1785, + "step": 10315 + }, + { + "epoch": 0.165056, + "grad_norm": 0.6640625, + "learning_rate": 8.416935483870968e-05, + "loss": 0.1776, + "step": 10316 + }, + { + "epoch": 0.165072, + "grad_norm": 1.0859375, + "learning_rate": 8.416774193548388e-05, + "loss": 0.1462, + "step": 10317 + }, + { + "epoch": 0.165088, + "grad_norm": 0.6015625, + "learning_rate": 8.416612903225807e-05, + "loss": 0.1647, + "step": 10318 + }, + { + "epoch": 0.165104, + "grad_norm": 0.515625, + "learning_rate": 8.416451612903227e-05, + "loss": 0.1546, + "step": 10319 + }, + { + "epoch": 0.16512, + "grad_norm": 0.5703125, + "learning_rate": 8.416290322580646e-05, + "loss": 0.1561, + "step": 10320 + }, + { + "epoch": 0.165136, + "grad_norm": 0.49609375, + "learning_rate": 8.416129032258064e-05, + "loss": 0.127, + "step": 10321 + }, + { + "epoch": 0.165152, + "grad_norm": 0.796875, + "learning_rate": 8.415967741935484e-05, + "loss": 0.1435, + "step": 10322 + }, + { + "epoch": 0.165168, + "grad_norm": 0.78515625, + "learning_rate": 8.415806451612903e-05, + "loss": 0.1674, + "step": 10323 + }, + { + "epoch": 0.165184, + "grad_norm": 1.140625, + "learning_rate": 8.415645161290323e-05, + "loss": 0.2004, + "step": 10324 + }, + { + "epoch": 0.1652, + "grad_norm": 0.8515625, + "learning_rate": 8.415483870967741e-05, + "loss": 0.1781, + "step": 10325 + }, + { + "epoch": 0.165216, + "grad_norm": 1.25, + "learning_rate": 8.415322580645161e-05, + "loss": 0.1955, + "step": 10326 + }, + { + "epoch": 0.165232, + "grad_norm": 1.0234375, + "learning_rate": 8.415161290322581e-05, + "loss": 0.1946, + "step": 10327 + }, + { + "epoch": 0.165248, + "grad_norm": 0.80078125, + "learning_rate": 8.415000000000001e-05, + "loss": 0.1715, + "step": 10328 + }, + { + "epoch": 0.165264, + "grad_norm": 0.58984375, + "learning_rate": 8.41483870967742e-05, + "loss": 0.1755, + "step": 10329 + }, + { + "epoch": 0.16528, + "grad_norm": 1.0625, + "learning_rate": 8.41467741935484e-05, + "loss": 0.1904, + "step": 10330 + }, + { + "epoch": 0.165296, + "grad_norm": 0.6171875, + "learning_rate": 8.414516129032258e-05, + "loss": 0.1505, + "step": 10331 + }, + { + "epoch": 0.165312, + "grad_norm": 0.76171875, + "learning_rate": 8.414354838709678e-05, + "loss": 0.1793, + "step": 10332 + }, + { + "epoch": 0.165328, + "grad_norm": 0.60546875, + "learning_rate": 8.414193548387097e-05, + "loss": 0.1447, + "step": 10333 + }, + { + "epoch": 0.165344, + "grad_norm": 0.73828125, + "learning_rate": 8.414032258064517e-05, + "loss": 0.1225, + "step": 10334 + }, + { + "epoch": 0.16536, + "grad_norm": 0.71875, + "learning_rate": 8.413870967741936e-05, + "loss": 0.1992, + "step": 10335 + }, + { + "epoch": 0.165376, + "grad_norm": 0.9375, + "learning_rate": 8.413709677419356e-05, + "loss": 0.1922, + "step": 10336 + }, + { + "epoch": 0.165392, + "grad_norm": 0.6953125, + "learning_rate": 8.413548387096774e-05, + "loss": 0.1687, + "step": 10337 + }, + { + "epoch": 0.165408, + "grad_norm": 0.703125, + "learning_rate": 8.413387096774194e-05, + "loss": 0.1598, + "step": 10338 + }, + { + "epoch": 0.165424, + "grad_norm": 1.09375, + "learning_rate": 8.413225806451614e-05, + "loss": 0.1707, + "step": 10339 + }, + { + "epoch": 0.16544, + "grad_norm": 0.86328125, + "learning_rate": 8.413064516129033e-05, + "loss": 0.172, + "step": 10340 + }, + { + "epoch": 0.165456, + "grad_norm": 0.9375, + "learning_rate": 8.412903225806453e-05, + "loss": 0.1794, + "step": 10341 + }, + { + "epoch": 0.165472, + "grad_norm": 0.51171875, + "learning_rate": 8.412741935483871e-05, + "loss": 0.1658, + "step": 10342 + }, + { + "epoch": 0.165488, + "grad_norm": 0.76953125, + "learning_rate": 8.412580645161291e-05, + "loss": 0.1989, + "step": 10343 + }, + { + "epoch": 0.165504, + "grad_norm": 0.439453125, + "learning_rate": 8.41241935483871e-05, + "loss": 0.171, + "step": 10344 + }, + { + "epoch": 0.16552, + "grad_norm": 0.92578125, + "learning_rate": 8.41225806451613e-05, + "loss": 0.1899, + "step": 10345 + }, + { + "epoch": 0.165536, + "grad_norm": 0.68359375, + "learning_rate": 8.412096774193548e-05, + "loss": 0.1822, + "step": 10346 + }, + { + "epoch": 0.165552, + "grad_norm": 0.75, + "learning_rate": 8.411935483870968e-05, + "loss": 0.2002, + "step": 10347 + }, + { + "epoch": 0.165568, + "grad_norm": 0.6796875, + "learning_rate": 8.411774193548387e-05, + "loss": 0.1768, + "step": 10348 + }, + { + "epoch": 0.165584, + "grad_norm": 0.62109375, + "learning_rate": 8.411612903225807e-05, + "loss": 0.1812, + "step": 10349 + }, + { + "epoch": 0.1656, + "grad_norm": 0.71484375, + "learning_rate": 8.411451612903226e-05, + "loss": 0.151, + "step": 10350 + }, + { + "epoch": 0.165616, + "grad_norm": 0.6015625, + "learning_rate": 8.411290322580645e-05, + "loss": 0.1662, + "step": 10351 + }, + { + "epoch": 0.165632, + "grad_norm": 1.0625, + "learning_rate": 8.411129032258065e-05, + "loss": 0.166, + "step": 10352 + }, + { + "epoch": 0.165648, + "grad_norm": 0.78515625, + "learning_rate": 8.410967741935484e-05, + "loss": 0.168, + "step": 10353 + }, + { + "epoch": 0.165664, + "grad_norm": 1.546875, + "learning_rate": 8.410806451612904e-05, + "loss": 0.2214, + "step": 10354 + }, + { + "epoch": 0.16568, + "grad_norm": 1.4296875, + "learning_rate": 8.410645161290323e-05, + "loss": 0.1621, + "step": 10355 + }, + { + "epoch": 0.165696, + "grad_norm": 0.67578125, + "learning_rate": 8.410483870967743e-05, + "loss": 0.1486, + "step": 10356 + }, + { + "epoch": 0.165712, + "grad_norm": 0.703125, + "learning_rate": 8.410322580645161e-05, + "loss": 0.145, + "step": 10357 + }, + { + "epoch": 0.165728, + "grad_norm": 0.7265625, + "learning_rate": 8.410161290322581e-05, + "loss": 0.1496, + "step": 10358 + }, + { + "epoch": 0.165744, + "grad_norm": 0.671875, + "learning_rate": 8.41e-05, + "loss": 0.2014, + "step": 10359 + }, + { + "epoch": 0.16576, + "grad_norm": 0.7890625, + "learning_rate": 8.40983870967742e-05, + "loss": 0.182, + "step": 10360 + }, + { + "epoch": 0.165776, + "grad_norm": 0.57421875, + "learning_rate": 8.409677419354838e-05, + "loss": 0.1493, + "step": 10361 + }, + { + "epoch": 0.165792, + "grad_norm": 1.1875, + "learning_rate": 8.409516129032258e-05, + "loss": 0.2648, + "step": 10362 + }, + { + "epoch": 0.165808, + "grad_norm": 0.640625, + "learning_rate": 8.409354838709678e-05, + "loss": 0.1781, + "step": 10363 + }, + { + "epoch": 0.165824, + "grad_norm": 0.796875, + "learning_rate": 8.409193548387098e-05, + "loss": 0.1863, + "step": 10364 + }, + { + "epoch": 0.16584, + "grad_norm": 1.1015625, + "learning_rate": 8.409032258064517e-05, + "loss": 0.2481, + "step": 10365 + }, + { + "epoch": 0.165856, + "grad_norm": 1.0, + "learning_rate": 8.408870967741937e-05, + "loss": 0.2011, + "step": 10366 + }, + { + "epoch": 0.165872, + "grad_norm": 0.96484375, + "learning_rate": 8.408709677419355e-05, + "loss": 0.1471, + "step": 10367 + }, + { + "epoch": 0.165888, + "grad_norm": 0.6328125, + "learning_rate": 8.408548387096774e-05, + "loss": 0.1557, + "step": 10368 + }, + { + "epoch": 0.165904, + "grad_norm": 0.796875, + "learning_rate": 8.408387096774194e-05, + "loss": 0.1386, + "step": 10369 + }, + { + "epoch": 0.16592, + "grad_norm": 0.7890625, + "learning_rate": 8.408225806451613e-05, + "loss": 0.1846, + "step": 10370 + }, + { + "epoch": 0.165936, + "grad_norm": 0.6640625, + "learning_rate": 8.408064516129033e-05, + "loss": 0.1774, + "step": 10371 + }, + { + "epoch": 0.165952, + "grad_norm": 0.85546875, + "learning_rate": 8.407903225806451e-05, + "loss": 0.1923, + "step": 10372 + }, + { + "epoch": 0.165968, + "grad_norm": 0.59765625, + "learning_rate": 8.407741935483871e-05, + "loss": 0.1528, + "step": 10373 + }, + { + "epoch": 0.165984, + "grad_norm": 1.0703125, + "learning_rate": 8.40758064516129e-05, + "loss": 0.1996, + "step": 10374 + }, + { + "epoch": 0.166, + "grad_norm": 0.953125, + "learning_rate": 8.40741935483871e-05, + "loss": 0.1481, + "step": 10375 + }, + { + "epoch": 0.166016, + "grad_norm": 0.69921875, + "learning_rate": 8.40725806451613e-05, + "loss": 0.1636, + "step": 10376 + }, + { + "epoch": 0.166032, + "grad_norm": 0.71484375, + "learning_rate": 8.40709677419355e-05, + "loss": 0.1561, + "step": 10377 + }, + { + "epoch": 0.166048, + "grad_norm": 0.98046875, + "learning_rate": 8.406935483870968e-05, + "loss": 0.1671, + "step": 10378 + }, + { + "epoch": 0.166064, + "grad_norm": 0.54296875, + "learning_rate": 8.406774193548388e-05, + "loss": 0.152, + "step": 10379 + }, + { + "epoch": 0.16608, + "grad_norm": 0.671875, + "learning_rate": 8.406612903225807e-05, + "loss": 0.1802, + "step": 10380 + }, + { + "epoch": 0.166096, + "grad_norm": 0.62890625, + "learning_rate": 8.406451612903227e-05, + "loss": 0.2034, + "step": 10381 + }, + { + "epoch": 0.166112, + "grad_norm": 0.6953125, + "learning_rate": 8.406290322580645e-05, + "loss": 0.17, + "step": 10382 + }, + { + "epoch": 0.166128, + "grad_norm": 0.828125, + "learning_rate": 8.406129032258065e-05, + "loss": 0.2071, + "step": 10383 + }, + { + "epoch": 0.166144, + "grad_norm": 0.86328125, + "learning_rate": 8.405967741935484e-05, + "loss": 0.1784, + "step": 10384 + }, + { + "epoch": 0.16616, + "grad_norm": 1.1328125, + "learning_rate": 8.405806451612903e-05, + "loss": 0.1643, + "step": 10385 + }, + { + "epoch": 0.166176, + "grad_norm": 0.69921875, + "learning_rate": 8.405645161290323e-05, + "loss": 0.1595, + "step": 10386 + }, + { + "epoch": 0.166192, + "grad_norm": 0.71875, + "learning_rate": 8.405483870967742e-05, + "loss": 0.1614, + "step": 10387 + }, + { + "epoch": 0.166208, + "grad_norm": 0.72265625, + "learning_rate": 8.405322580645162e-05, + "loss": 0.1853, + "step": 10388 + }, + { + "epoch": 0.166224, + "grad_norm": 0.57421875, + "learning_rate": 8.405161290322581e-05, + "loss": 0.1714, + "step": 10389 + }, + { + "epoch": 0.16624, + "grad_norm": 0.73828125, + "learning_rate": 8.405000000000001e-05, + "loss": 0.1781, + "step": 10390 + }, + { + "epoch": 0.166256, + "grad_norm": 1.1875, + "learning_rate": 8.40483870967742e-05, + "loss": 0.1813, + "step": 10391 + }, + { + "epoch": 0.166272, + "grad_norm": 0.5078125, + "learning_rate": 8.40467741935484e-05, + "loss": 0.1281, + "step": 10392 + }, + { + "epoch": 0.166288, + "grad_norm": 0.734375, + "learning_rate": 8.404516129032258e-05, + "loss": 0.1504, + "step": 10393 + }, + { + "epoch": 0.166304, + "grad_norm": 0.640625, + "learning_rate": 8.404354838709678e-05, + "loss": 0.1396, + "step": 10394 + }, + { + "epoch": 0.16632, + "grad_norm": 0.671875, + "learning_rate": 8.404193548387097e-05, + "loss": 0.1583, + "step": 10395 + }, + { + "epoch": 0.166336, + "grad_norm": 0.890625, + "learning_rate": 8.404032258064517e-05, + "loss": 0.1534, + "step": 10396 + }, + { + "epoch": 0.166352, + "grad_norm": 0.79296875, + "learning_rate": 8.403870967741935e-05, + "loss": 0.1615, + "step": 10397 + }, + { + "epoch": 0.166368, + "grad_norm": 1.203125, + "learning_rate": 8.403709677419355e-05, + "loss": 0.187, + "step": 10398 + }, + { + "epoch": 0.166384, + "grad_norm": 0.9609375, + "learning_rate": 8.403548387096775e-05, + "loss": 0.1725, + "step": 10399 + }, + { + "epoch": 0.1664, + "grad_norm": 0.69140625, + "learning_rate": 8.403387096774194e-05, + "loss": 0.1803, + "step": 10400 + }, + { + "epoch": 0.166416, + "grad_norm": 0.92578125, + "learning_rate": 8.403225806451614e-05, + "loss": 0.1789, + "step": 10401 + }, + { + "epoch": 0.166432, + "grad_norm": 0.88671875, + "learning_rate": 8.403064516129032e-05, + "loss": 0.1455, + "step": 10402 + }, + { + "epoch": 0.166448, + "grad_norm": 1.328125, + "learning_rate": 8.402903225806452e-05, + "loss": 0.19, + "step": 10403 + }, + { + "epoch": 0.166464, + "grad_norm": 1.1328125, + "learning_rate": 8.402741935483871e-05, + "loss": 0.1834, + "step": 10404 + }, + { + "epoch": 0.16648, + "grad_norm": 0.88671875, + "learning_rate": 8.402580645161291e-05, + "loss": 0.1537, + "step": 10405 + }, + { + "epoch": 0.166496, + "grad_norm": 0.98828125, + "learning_rate": 8.40241935483871e-05, + "loss": 0.1575, + "step": 10406 + }, + { + "epoch": 0.166512, + "grad_norm": 0.7421875, + "learning_rate": 8.40225806451613e-05, + "loss": 0.1746, + "step": 10407 + }, + { + "epoch": 0.166528, + "grad_norm": 0.640625, + "learning_rate": 8.402096774193548e-05, + "loss": 0.2106, + "step": 10408 + }, + { + "epoch": 0.166544, + "grad_norm": 0.76171875, + "learning_rate": 8.401935483870968e-05, + "loss": 0.1668, + "step": 10409 + }, + { + "epoch": 0.16656, + "grad_norm": 1.4375, + "learning_rate": 8.401774193548387e-05, + "loss": 0.1566, + "step": 10410 + }, + { + "epoch": 0.166576, + "grad_norm": 0.65234375, + "learning_rate": 8.401612903225807e-05, + "loss": 0.1869, + "step": 10411 + }, + { + "epoch": 0.166592, + "grad_norm": 0.8984375, + "learning_rate": 8.401451612903227e-05, + "loss": 0.1642, + "step": 10412 + }, + { + "epoch": 0.166608, + "grad_norm": 0.76171875, + "learning_rate": 8.401290322580647e-05, + "loss": 0.155, + "step": 10413 + }, + { + "epoch": 0.166624, + "grad_norm": 0.8359375, + "learning_rate": 8.401129032258065e-05, + "loss": 0.1933, + "step": 10414 + }, + { + "epoch": 0.16664, + "grad_norm": 1.046875, + "learning_rate": 8.400967741935484e-05, + "loss": 0.1916, + "step": 10415 + }, + { + "epoch": 0.166656, + "grad_norm": 0.67578125, + "learning_rate": 8.400806451612904e-05, + "loss": 0.1586, + "step": 10416 + }, + { + "epoch": 0.166672, + "grad_norm": 0.828125, + "learning_rate": 8.400645161290322e-05, + "loss": 0.1743, + "step": 10417 + }, + { + "epoch": 0.166688, + "grad_norm": 0.88671875, + "learning_rate": 8.400483870967742e-05, + "loss": 0.1532, + "step": 10418 + }, + { + "epoch": 0.166704, + "grad_norm": 0.5703125, + "learning_rate": 8.400322580645161e-05, + "loss": 0.1835, + "step": 10419 + }, + { + "epoch": 0.16672, + "grad_norm": 0.83984375, + "learning_rate": 8.400161290322581e-05, + "loss": 0.1848, + "step": 10420 + }, + { + "epoch": 0.166736, + "grad_norm": 0.59765625, + "learning_rate": 8.4e-05, + "loss": 0.1614, + "step": 10421 + }, + { + "epoch": 0.166752, + "grad_norm": 1.1484375, + "learning_rate": 8.39983870967742e-05, + "loss": 0.1962, + "step": 10422 + }, + { + "epoch": 0.166768, + "grad_norm": 0.65625, + "learning_rate": 8.39967741935484e-05, + "loss": 0.1611, + "step": 10423 + }, + { + "epoch": 0.166784, + "grad_norm": 0.6796875, + "learning_rate": 8.39951612903226e-05, + "loss": 0.1588, + "step": 10424 + }, + { + "epoch": 0.1668, + "grad_norm": 0.70703125, + "learning_rate": 8.399354838709678e-05, + "loss": 0.1768, + "step": 10425 + }, + { + "epoch": 0.166816, + "grad_norm": 0.765625, + "learning_rate": 8.399193548387098e-05, + "loss": 0.1536, + "step": 10426 + }, + { + "epoch": 0.166832, + "grad_norm": 1.1953125, + "learning_rate": 8.399032258064517e-05, + "loss": 0.1951, + "step": 10427 + }, + { + "epoch": 0.166848, + "grad_norm": 0.90234375, + "learning_rate": 8.398870967741937e-05, + "loss": 0.1734, + "step": 10428 + }, + { + "epoch": 0.166864, + "grad_norm": 0.90234375, + "learning_rate": 8.398709677419355e-05, + "loss": 0.1423, + "step": 10429 + }, + { + "epoch": 0.16688, + "grad_norm": 0.80078125, + "learning_rate": 8.398548387096774e-05, + "loss": 0.1507, + "step": 10430 + }, + { + "epoch": 0.166896, + "grad_norm": 0.8984375, + "learning_rate": 8.398387096774194e-05, + "loss": 0.1837, + "step": 10431 + }, + { + "epoch": 0.166912, + "grad_norm": 0.64453125, + "learning_rate": 8.398225806451612e-05, + "loss": 0.1681, + "step": 10432 + }, + { + "epoch": 0.166928, + "grad_norm": 1.21875, + "learning_rate": 8.398064516129032e-05, + "loss": 0.1916, + "step": 10433 + }, + { + "epoch": 0.166944, + "grad_norm": 0.90234375, + "learning_rate": 8.397903225806452e-05, + "loss": 0.195, + "step": 10434 + }, + { + "epoch": 0.16696, + "grad_norm": 1.640625, + "learning_rate": 8.397741935483871e-05, + "loss": 0.1745, + "step": 10435 + }, + { + "epoch": 0.166976, + "grad_norm": 0.8984375, + "learning_rate": 8.397580645161291e-05, + "loss": 0.2014, + "step": 10436 + }, + { + "epoch": 0.166992, + "grad_norm": 0.703125, + "learning_rate": 8.397419354838711e-05, + "loss": 0.1466, + "step": 10437 + }, + { + "epoch": 0.167008, + "grad_norm": 1.1484375, + "learning_rate": 8.39725806451613e-05, + "loss": 0.1829, + "step": 10438 + }, + { + "epoch": 0.167024, + "grad_norm": 0.8515625, + "learning_rate": 8.39709677419355e-05, + "loss": 0.1839, + "step": 10439 + }, + { + "epoch": 0.16704, + "grad_norm": 0.75390625, + "learning_rate": 8.396935483870968e-05, + "loss": 0.1633, + "step": 10440 + }, + { + "epoch": 0.167056, + "grad_norm": 0.87890625, + "learning_rate": 8.396774193548388e-05, + "loss": 0.1883, + "step": 10441 + }, + { + "epoch": 0.167072, + "grad_norm": 0.93359375, + "learning_rate": 8.396612903225807e-05, + "loss": 0.1984, + "step": 10442 + }, + { + "epoch": 0.167088, + "grad_norm": 0.8671875, + "learning_rate": 8.396451612903227e-05, + "loss": 0.1775, + "step": 10443 + }, + { + "epoch": 0.167104, + "grad_norm": 0.73046875, + "learning_rate": 8.396290322580645e-05, + "loss": 0.1837, + "step": 10444 + }, + { + "epoch": 0.16712, + "grad_norm": 0.90625, + "learning_rate": 8.396129032258065e-05, + "loss": 0.1759, + "step": 10445 + }, + { + "epoch": 0.167136, + "grad_norm": 0.81640625, + "learning_rate": 8.395967741935484e-05, + "loss": 0.1569, + "step": 10446 + }, + { + "epoch": 0.167152, + "grad_norm": 0.640625, + "learning_rate": 8.395806451612904e-05, + "loss": 0.1195, + "step": 10447 + }, + { + "epoch": 0.167168, + "grad_norm": 0.65625, + "learning_rate": 8.395645161290324e-05, + "loss": 0.1663, + "step": 10448 + }, + { + "epoch": 0.167184, + "grad_norm": 0.7421875, + "learning_rate": 8.395483870967742e-05, + "loss": 0.2034, + "step": 10449 + }, + { + "epoch": 0.1672, + "grad_norm": 0.7109375, + "learning_rate": 8.395322580645162e-05, + "loss": 0.1938, + "step": 10450 + }, + { + "epoch": 0.167216, + "grad_norm": 0.71484375, + "learning_rate": 8.395161290322581e-05, + "loss": 0.1858, + "step": 10451 + }, + { + "epoch": 0.167232, + "grad_norm": 0.7890625, + "learning_rate": 8.395000000000001e-05, + "loss": 0.2338, + "step": 10452 + }, + { + "epoch": 0.167248, + "grad_norm": 0.85546875, + "learning_rate": 8.39483870967742e-05, + "loss": 0.1835, + "step": 10453 + }, + { + "epoch": 0.167264, + "grad_norm": 0.9296875, + "learning_rate": 8.39467741935484e-05, + "loss": 0.1697, + "step": 10454 + }, + { + "epoch": 0.16728, + "grad_norm": 0.7578125, + "learning_rate": 8.394516129032258e-05, + "loss": 0.1681, + "step": 10455 + }, + { + "epoch": 0.167296, + "grad_norm": 0.625, + "learning_rate": 8.394354838709678e-05, + "loss": 0.1598, + "step": 10456 + }, + { + "epoch": 0.167312, + "grad_norm": 0.54296875, + "learning_rate": 8.394193548387097e-05, + "loss": 0.1854, + "step": 10457 + }, + { + "epoch": 0.167328, + "grad_norm": 0.9453125, + "learning_rate": 8.394032258064516e-05, + "loss": 0.1736, + "step": 10458 + }, + { + "epoch": 0.167344, + "grad_norm": 1.015625, + "learning_rate": 8.393870967741936e-05, + "loss": 0.1554, + "step": 10459 + }, + { + "epoch": 0.16736, + "grad_norm": 0.578125, + "learning_rate": 8.393709677419356e-05, + "loss": 0.1349, + "step": 10460 + }, + { + "epoch": 0.167376, + "grad_norm": 0.859375, + "learning_rate": 8.393548387096775e-05, + "loss": 0.1823, + "step": 10461 + }, + { + "epoch": 0.167392, + "grad_norm": 0.84765625, + "learning_rate": 8.393387096774194e-05, + "loss": 0.1524, + "step": 10462 + }, + { + "epoch": 0.167408, + "grad_norm": 0.7265625, + "learning_rate": 8.393225806451614e-05, + "loss": 0.1193, + "step": 10463 + }, + { + "epoch": 0.167424, + "grad_norm": 0.9921875, + "learning_rate": 8.393064516129032e-05, + "loss": 0.1683, + "step": 10464 + }, + { + "epoch": 0.16744, + "grad_norm": 0.6953125, + "learning_rate": 8.392903225806452e-05, + "loss": 0.1826, + "step": 10465 + }, + { + "epoch": 0.167456, + "grad_norm": 1.3515625, + "learning_rate": 8.392741935483871e-05, + "loss": 0.2114, + "step": 10466 + }, + { + "epoch": 0.167472, + "grad_norm": 0.79296875, + "learning_rate": 8.392580645161291e-05, + "loss": 0.123, + "step": 10467 + }, + { + "epoch": 0.167488, + "grad_norm": 0.71875, + "learning_rate": 8.39241935483871e-05, + "loss": 0.1845, + "step": 10468 + }, + { + "epoch": 0.167504, + "grad_norm": 0.921875, + "learning_rate": 8.392258064516129e-05, + "loss": 0.1477, + "step": 10469 + }, + { + "epoch": 0.16752, + "grad_norm": 0.8671875, + "learning_rate": 8.392096774193548e-05, + "loss": 0.1251, + "step": 10470 + }, + { + "epoch": 0.167536, + "grad_norm": 1.328125, + "learning_rate": 8.391935483870968e-05, + "loss": 0.202, + "step": 10471 + }, + { + "epoch": 0.167552, + "grad_norm": 1.1171875, + "learning_rate": 8.391774193548388e-05, + "loss": 0.217, + "step": 10472 + }, + { + "epoch": 0.167568, + "grad_norm": 0.66015625, + "learning_rate": 8.391612903225808e-05, + "loss": 0.198, + "step": 10473 + }, + { + "epoch": 0.167584, + "grad_norm": 0.6015625, + "learning_rate": 8.391451612903226e-05, + "loss": 0.1659, + "step": 10474 + }, + { + "epoch": 0.1676, + "grad_norm": 0.625, + "learning_rate": 8.391290322580646e-05, + "loss": 0.1855, + "step": 10475 + }, + { + "epoch": 0.167616, + "grad_norm": 0.59375, + "learning_rate": 8.391129032258065e-05, + "loss": 0.142, + "step": 10476 + }, + { + "epoch": 0.167632, + "grad_norm": 0.55859375, + "learning_rate": 8.390967741935484e-05, + "loss": 0.1465, + "step": 10477 + }, + { + "epoch": 0.167648, + "grad_norm": 0.57421875, + "learning_rate": 8.390806451612904e-05, + "loss": 0.1748, + "step": 10478 + }, + { + "epoch": 0.167664, + "grad_norm": 0.75390625, + "learning_rate": 8.390645161290322e-05, + "loss": 0.1978, + "step": 10479 + }, + { + "epoch": 0.16768, + "grad_norm": 0.921875, + "learning_rate": 8.390483870967742e-05, + "loss": 0.137, + "step": 10480 + }, + { + "epoch": 0.167696, + "grad_norm": 0.69921875, + "learning_rate": 8.390322580645161e-05, + "loss": 0.1644, + "step": 10481 + }, + { + "epoch": 0.167712, + "grad_norm": 1.21875, + "learning_rate": 8.390161290322581e-05, + "loss": 0.2018, + "step": 10482 + }, + { + "epoch": 0.167728, + "grad_norm": 0.76953125, + "learning_rate": 8.39e-05, + "loss": 0.1609, + "step": 10483 + }, + { + "epoch": 0.167744, + "grad_norm": 0.95703125, + "learning_rate": 8.38983870967742e-05, + "loss": 0.187, + "step": 10484 + }, + { + "epoch": 0.16776, + "grad_norm": 0.6953125, + "learning_rate": 8.389677419354839e-05, + "loss": 0.1521, + "step": 10485 + }, + { + "epoch": 0.167776, + "grad_norm": 0.99609375, + "learning_rate": 8.389516129032259e-05, + "loss": 0.1702, + "step": 10486 + }, + { + "epoch": 0.167792, + "grad_norm": 0.7734375, + "learning_rate": 8.389354838709678e-05, + "loss": 0.1836, + "step": 10487 + }, + { + "epoch": 0.167808, + "grad_norm": 0.7890625, + "learning_rate": 8.389193548387098e-05, + "loss": 0.2259, + "step": 10488 + }, + { + "epoch": 0.167824, + "grad_norm": 0.9453125, + "learning_rate": 8.389032258064516e-05, + "loss": 0.1566, + "step": 10489 + }, + { + "epoch": 0.16784, + "grad_norm": 0.625, + "learning_rate": 8.388870967741936e-05, + "loss": 0.1746, + "step": 10490 + }, + { + "epoch": 0.167856, + "grad_norm": 0.58203125, + "learning_rate": 8.388709677419355e-05, + "loss": 0.1681, + "step": 10491 + }, + { + "epoch": 0.167872, + "grad_norm": 0.8359375, + "learning_rate": 8.388548387096774e-05, + "loss": 0.174, + "step": 10492 + }, + { + "epoch": 0.167888, + "grad_norm": 0.64453125, + "learning_rate": 8.388387096774194e-05, + "loss": 0.1953, + "step": 10493 + }, + { + "epoch": 0.167904, + "grad_norm": 0.8515625, + "learning_rate": 8.388225806451613e-05, + "loss": 0.2138, + "step": 10494 + }, + { + "epoch": 0.16792, + "grad_norm": 0.7265625, + "learning_rate": 8.388064516129033e-05, + "loss": 0.1789, + "step": 10495 + }, + { + "epoch": 0.167936, + "grad_norm": 0.83984375, + "learning_rate": 8.387903225806452e-05, + "loss": 0.1961, + "step": 10496 + }, + { + "epoch": 0.167952, + "grad_norm": 0.5703125, + "learning_rate": 8.387741935483872e-05, + "loss": 0.1418, + "step": 10497 + }, + { + "epoch": 0.167968, + "grad_norm": 0.73046875, + "learning_rate": 8.38758064516129e-05, + "loss": 0.2006, + "step": 10498 + }, + { + "epoch": 0.167984, + "grad_norm": 1.0, + "learning_rate": 8.38741935483871e-05, + "loss": 0.197, + "step": 10499 + }, + { + "epoch": 0.168, + "grad_norm": 0.5390625, + "learning_rate": 8.387258064516129e-05, + "loss": 0.144, + "step": 10500 + }, + { + "epoch": 0.168016, + "grad_norm": 0.83203125, + "learning_rate": 8.387096774193549e-05, + "loss": 0.1826, + "step": 10501 + }, + { + "epoch": 0.168032, + "grad_norm": 0.6015625, + "learning_rate": 8.386935483870968e-05, + "loss": 0.1719, + "step": 10502 + }, + { + "epoch": 0.168048, + "grad_norm": 0.76953125, + "learning_rate": 8.386774193548388e-05, + "loss": 0.2011, + "step": 10503 + }, + { + "epoch": 0.168064, + "grad_norm": 0.8046875, + "learning_rate": 8.386612903225806e-05, + "loss": 0.1606, + "step": 10504 + }, + { + "epoch": 0.16808, + "grad_norm": 0.76171875, + "learning_rate": 8.386451612903226e-05, + "loss": 0.1738, + "step": 10505 + }, + { + "epoch": 0.168096, + "grad_norm": 0.98046875, + "learning_rate": 8.386290322580645e-05, + "loss": 0.1914, + "step": 10506 + }, + { + "epoch": 0.168112, + "grad_norm": 0.58203125, + "learning_rate": 8.386129032258065e-05, + "loss": 0.1562, + "step": 10507 + }, + { + "epoch": 0.168128, + "grad_norm": 0.97265625, + "learning_rate": 8.385967741935485e-05, + "loss": 0.1874, + "step": 10508 + }, + { + "epoch": 0.168144, + "grad_norm": 0.91796875, + "learning_rate": 8.385806451612903e-05, + "loss": 0.1412, + "step": 10509 + }, + { + "epoch": 0.16816, + "grad_norm": 1.0, + "learning_rate": 8.385645161290323e-05, + "loss": 0.1823, + "step": 10510 + }, + { + "epoch": 0.168176, + "grad_norm": 0.62890625, + "learning_rate": 8.385483870967742e-05, + "loss": 0.196, + "step": 10511 + }, + { + "epoch": 0.168192, + "grad_norm": 1.0859375, + "learning_rate": 8.385322580645162e-05, + "loss": 0.1708, + "step": 10512 + }, + { + "epoch": 0.168208, + "grad_norm": 0.65234375, + "learning_rate": 8.38516129032258e-05, + "loss": 0.1669, + "step": 10513 + }, + { + "epoch": 0.168224, + "grad_norm": 0.8046875, + "learning_rate": 8.385e-05, + "loss": 0.2004, + "step": 10514 + }, + { + "epoch": 0.16824, + "grad_norm": 0.81640625, + "learning_rate": 8.384838709677419e-05, + "loss": 0.1563, + "step": 10515 + }, + { + "epoch": 0.168256, + "grad_norm": 0.73046875, + "learning_rate": 8.384677419354839e-05, + "loss": 0.1695, + "step": 10516 + }, + { + "epoch": 0.168272, + "grad_norm": 1.3203125, + "learning_rate": 8.384516129032258e-05, + "loss": 0.2057, + "step": 10517 + }, + { + "epoch": 0.168288, + "grad_norm": 0.66015625, + "learning_rate": 8.384354838709678e-05, + "loss": 0.1836, + "step": 10518 + }, + { + "epoch": 0.168304, + "grad_norm": 0.78515625, + "learning_rate": 8.384193548387098e-05, + "loss": 0.1845, + "step": 10519 + }, + { + "epoch": 0.16832, + "grad_norm": 0.62109375, + "learning_rate": 8.384032258064518e-05, + "loss": 0.1573, + "step": 10520 + }, + { + "epoch": 0.168336, + "grad_norm": 0.8046875, + "learning_rate": 8.383870967741936e-05, + "loss": 0.1811, + "step": 10521 + }, + { + "epoch": 0.168352, + "grad_norm": 0.71875, + "learning_rate": 8.383709677419356e-05, + "loss": 0.1788, + "step": 10522 + }, + { + "epoch": 0.168368, + "grad_norm": 0.7265625, + "learning_rate": 8.383548387096775e-05, + "loss": 0.1895, + "step": 10523 + }, + { + "epoch": 0.168384, + "grad_norm": 1.03125, + "learning_rate": 8.383387096774193e-05, + "loss": 0.191, + "step": 10524 + }, + { + "epoch": 0.1684, + "grad_norm": 0.73046875, + "learning_rate": 8.383225806451613e-05, + "loss": 0.1614, + "step": 10525 + }, + { + "epoch": 0.168416, + "grad_norm": 0.546875, + "learning_rate": 8.383064516129032e-05, + "loss": 0.1556, + "step": 10526 + }, + { + "epoch": 0.168432, + "grad_norm": 0.74609375, + "learning_rate": 8.382903225806452e-05, + "loss": 0.1721, + "step": 10527 + }, + { + "epoch": 0.168448, + "grad_norm": 0.77734375, + "learning_rate": 8.38274193548387e-05, + "loss": 0.1543, + "step": 10528 + }, + { + "epoch": 0.168464, + "grad_norm": 0.7265625, + "learning_rate": 8.38258064516129e-05, + "loss": 0.1487, + "step": 10529 + }, + { + "epoch": 0.16848, + "grad_norm": 0.6875, + "learning_rate": 8.38241935483871e-05, + "loss": 0.1396, + "step": 10530 + }, + { + "epoch": 0.168496, + "grad_norm": 0.703125, + "learning_rate": 8.382258064516129e-05, + "loss": 0.2166, + "step": 10531 + }, + { + "epoch": 0.168512, + "grad_norm": 0.71484375, + "learning_rate": 8.382096774193549e-05, + "loss": 0.1317, + "step": 10532 + }, + { + "epoch": 0.168528, + "grad_norm": 0.93359375, + "learning_rate": 8.381935483870969e-05, + "loss": 0.1935, + "step": 10533 + }, + { + "epoch": 0.168544, + "grad_norm": 0.765625, + "learning_rate": 8.381774193548388e-05, + "loss": 0.185, + "step": 10534 + }, + { + "epoch": 0.16856, + "grad_norm": 1.1171875, + "learning_rate": 8.381612903225808e-05, + "loss": 0.1765, + "step": 10535 + }, + { + "epoch": 0.168576, + "grad_norm": 0.81640625, + "learning_rate": 8.381451612903226e-05, + "loss": 0.2253, + "step": 10536 + }, + { + "epoch": 0.168592, + "grad_norm": 0.8203125, + "learning_rate": 8.381290322580646e-05, + "loss": 0.1387, + "step": 10537 + }, + { + "epoch": 0.168608, + "grad_norm": 0.60546875, + "learning_rate": 8.381129032258065e-05, + "loss": 0.1395, + "step": 10538 + }, + { + "epoch": 0.168624, + "grad_norm": 0.80078125, + "learning_rate": 8.380967741935483e-05, + "loss": 0.167, + "step": 10539 + }, + { + "epoch": 0.16864, + "grad_norm": 0.74609375, + "learning_rate": 8.380806451612903e-05, + "loss": 0.1875, + "step": 10540 + }, + { + "epoch": 0.168656, + "grad_norm": 0.70703125, + "learning_rate": 8.380645161290322e-05, + "loss": 0.17, + "step": 10541 + }, + { + "epoch": 0.168672, + "grad_norm": 0.6328125, + "learning_rate": 8.380483870967742e-05, + "loss": 0.1564, + "step": 10542 + }, + { + "epoch": 0.168688, + "grad_norm": 0.828125, + "learning_rate": 8.380322580645162e-05, + "loss": 0.172, + "step": 10543 + }, + { + "epoch": 0.168704, + "grad_norm": 0.98828125, + "learning_rate": 8.380161290322582e-05, + "loss": 0.2634, + "step": 10544 + }, + { + "epoch": 0.16872, + "grad_norm": 0.87109375, + "learning_rate": 8.38e-05, + "loss": 0.1657, + "step": 10545 + }, + { + "epoch": 0.168736, + "grad_norm": 1.390625, + "learning_rate": 8.37983870967742e-05, + "loss": 0.22, + "step": 10546 + }, + { + "epoch": 0.168752, + "grad_norm": 0.5546875, + "learning_rate": 8.379677419354839e-05, + "loss": 0.1659, + "step": 10547 + }, + { + "epoch": 0.168768, + "grad_norm": 0.83984375, + "learning_rate": 8.379516129032259e-05, + "loss": 0.1746, + "step": 10548 + }, + { + "epoch": 0.168784, + "grad_norm": 1.2421875, + "learning_rate": 8.379354838709678e-05, + "loss": 0.2318, + "step": 10549 + }, + { + "epoch": 0.1688, + "grad_norm": 0.73828125, + "learning_rate": 8.379193548387098e-05, + "loss": 0.2091, + "step": 10550 + }, + { + "epoch": 0.168816, + "grad_norm": 0.9765625, + "learning_rate": 8.379032258064516e-05, + "loss": 0.1515, + "step": 10551 + }, + { + "epoch": 0.168832, + "grad_norm": 0.953125, + "learning_rate": 8.378870967741936e-05, + "loss": 0.2189, + "step": 10552 + }, + { + "epoch": 0.168848, + "grad_norm": 1.3359375, + "learning_rate": 8.378709677419355e-05, + "loss": 0.1343, + "step": 10553 + }, + { + "epoch": 0.168864, + "grad_norm": 0.79296875, + "learning_rate": 8.378548387096775e-05, + "loss": 0.1677, + "step": 10554 + }, + { + "epoch": 0.16888, + "grad_norm": 0.65234375, + "learning_rate": 8.378387096774195e-05, + "loss": 0.1742, + "step": 10555 + }, + { + "epoch": 0.168896, + "grad_norm": 0.671875, + "learning_rate": 8.378225806451613e-05, + "loss": 0.2001, + "step": 10556 + }, + { + "epoch": 0.168912, + "grad_norm": 0.8515625, + "learning_rate": 8.378064516129033e-05, + "loss": 0.157, + "step": 10557 + }, + { + "epoch": 0.168928, + "grad_norm": 0.74609375, + "learning_rate": 8.377903225806452e-05, + "loss": 0.1896, + "step": 10558 + }, + { + "epoch": 0.168944, + "grad_norm": 0.83203125, + "learning_rate": 8.377741935483872e-05, + "loss": 0.1721, + "step": 10559 + }, + { + "epoch": 0.16896, + "grad_norm": 0.83203125, + "learning_rate": 8.37758064516129e-05, + "loss": 0.1658, + "step": 10560 + }, + { + "epoch": 0.168976, + "grad_norm": 0.546875, + "learning_rate": 8.37741935483871e-05, + "loss": 0.1677, + "step": 10561 + }, + { + "epoch": 0.168992, + "grad_norm": 0.8359375, + "learning_rate": 8.377258064516129e-05, + "loss": 0.1955, + "step": 10562 + }, + { + "epoch": 0.169008, + "grad_norm": 0.458984375, + "learning_rate": 8.377096774193549e-05, + "loss": 0.1471, + "step": 10563 + }, + { + "epoch": 0.169024, + "grad_norm": 0.69921875, + "learning_rate": 8.376935483870968e-05, + "loss": 0.1641, + "step": 10564 + }, + { + "epoch": 0.16904, + "grad_norm": 1.390625, + "learning_rate": 8.376774193548387e-05, + "loss": 0.2091, + "step": 10565 + }, + { + "epoch": 0.169056, + "grad_norm": 1.1328125, + "learning_rate": 8.376612903225806e-05, + "loss": 0.2098, + "step": 10566 + }, + { + "epoch": 0.169072, + "grad_norm": 0.8359375, + "learning_rate": 8.376451612903226e-05, + "loss": 0.1745, + "step": 10567 + }, + { + "epoch": 0.169088, + "grad_norm": 0.71484375, + "learning_rate": 8.376290322580646e-05, + "loss": 0.1793, + "step": 10568 + }, + { + "epoch": 0.169104, + "grad_norm": 0.62109375, + "learning_rate": 8.376129032258066e-05, + "loss": 0.1634, + "step": 10569 + }, + { + "epoch": 0.16912, + "grad_norm": 1.1328125, + "learning_rate": 8.375967741935485e-05, + "loss": 0.2077, + "step": 10570 + }, + { + "epoch": 0.169136, + "grad_norm": 0.99609375, + "learning_rate": 8.375806451612903e-05, + "loss": 0.1521, + "step": 10571 + }, + { + "epoch": 0.169152, + "grad_norm": 0.83984375, + "learning_rate": 8.375645161290323e-05, + "loss": 0.1995, + "step": 10572 + }, + { + "epoch": 0.169168, + "grad_norm": 0.8515625, + "learning_rate": 8.375483870967742e-05, + "loss": 0.1272, + "step": 10573 + }, + { + "epoch": 0.169184, + "grad_norm": 0.70703125, + "learning_rate": 8.375322580645162e-05, + "loss": 0.1555, + "step": 10574 + }, + { + "epoch": 0.1692, + "grad_norm": 0.8046875, + "learning_rate": 8.37516129032258e-05, + "loss": 0.2227, + "step": 10575 + }, + { + "epoch": 0.169216, + "grad_norm": 0.62890625, + "learning_rate": 8.375e-05, + "loss": 0.1942, + "step": 10576 + }, + { + "epoch": 0.169232, + "grad_norm": 0.87890625, + "learning_rate": 8.374838709677419e-05, + "loss": 0.221, + "step": 10577 + }, + { + "epoch": 0.169248, + "grad_norm": 0.9296875, + "learning_rate": 8.374677419354839e-05, + "loss": 0.1607, + "step": 10578 + }, + { + "epoch": 0.169264, + "grad_norm": 0.62109375, + "learning_rate": 8.374516129032259e-05, + "loss": 0.1736, + "step": 10579 + }, + { + "epoch": 0.16928, + "grad_norm": 0.71484375, + "learning_rate": 8.374354838709679e-05, + "loss": 0.1823, + "step": 10580 + }, + { + "epoch": 0.169296, + "grad_norm": 0.93359375, + "learning_rate": 8.374193548387097e-05, + "loss": 0.1656, + "step": 10581 + }, + { + "epoch": 0.169312, + "grad_norm": 0.67578125, + "learning_rate": 8.374032258064517e-05, + "loss": 0.1345, + "step": 10582 + }, + { + "epoch": 0.169328, + "grad_norm": 0.72265625, + "learning_rate": 8.373870967741936e-05, + "loss": 0.1426, + "step": 10583 + }, + { + "epoch": 0.169344, + "grad_norm": 1.6328125, + "learning_rate": 8.373709677419356e-05, + "loss": 0.2093, + "step": 10584 + }, + { + "epoch": 0.16936, + "grad_norm": 0.62109375, + "learning_rate": 8.373548387096775e-05, + "loss": 0.1497, + "step": 10585 + }, + { + "epoch": 0.169376, + "grad_norm": 1.2734375, + "learning_rate": 8.373387096774193e-05, + "loss": 0.1516, + "step": 10586 + }, + { + "epoch": 0.169392, + "grad_norm": 1.6953125, + "learning_rate": 8.373225806451613e-05, + "loss": 0.1776, + "step": 10587 + }, + { + "epoch": 0.169408, + "grad_norm": 1.0234375, + "learning_rate": 8.373064516129032e-05, + "loss": 0.2166, + "step": 10588 + }, + { + "epoch": 0.169424, + "grad_norm": 1.2734375, + "learning_rate": 8.372903225806452e-05, + "loss": 0.2034, + "step": 10589 + }, + { + "epoch": 0.16944, + "grad_norm": 0.69921875, + "learning_rate": 8.372741935483872e-05, + "loss": 0.1902, + "step": 10590 + }, + { + "epoch": 0.169456, + "grad_norm": 1.3828125, + "learning_rate": 8.372580645161292e-05, + "loss": 0.2083, + "step": 10591 + }, + { + "epoch": 0.169472, + "grad_norm": 0.703125, + "learning_rate": 8.37241935483871e-05, + "loss": 0.1749, + "step": 10592 + }, + { + "epoch": 0.169488, + "grad_norm": 0.56640625, + "learning_rate": 8.37225806451613e-05, + "loss": 0.1303, + "step": 10593 + }, + { + "epoch": 0.169504, + "grad_norm": 1.15625, + "learning_rate": 8.372096774193549e-05, + "loss": 0.1794, + "step": 10594 + }, + { + "epoch": 0.16952, + "grad_norm": 0.90625, + "learning_rate": 8.371935483870969e-05, + "loss": 0.1846, + "step": 10595 + }, + { + "epoch": 0.169536, + "grad_norm": 0.5859375, + "learning_rate": 8.371774193548387e-05, + "loss": 0.1822, + "step": 10596 + }, + { + "epoch": 0.169552, + "grad_norm": 0.53515625, + "learning_rate": 8.371612903225807e-05, + "loss": 0.1463, + "step": 10597 + }, + { + "epoch": 0.169568, + "grad_norm": 0.91796875, + "learning_rate": 8.371451612903226e-05, + "loss": 0.1447, + "step": 10598 + }, + { + "epoch": 0.169584, + "grad_norm": 0.60546875, + "learning_rate": 8.371290322580646e-05, + "loss": 0.1676, + "step": 10599 + }, + { + "epoch": 0.1696, + "grad_norm": 0.91015625, + "learning_rate": 8.371129032258064e-05, + "loss": 0.1722, + "step": 10600 + }, + { + "epoch": 0.169616, + "grad_norm": 1.1328125, + "learning_rate": 8.370967741935483e-05, + "loss": 0.1987, + "step": 10601 + }, + { + "epoch": 0.169632, + "grad_norm": 0.8359375, + "learning_rate": 8.370806451612903e-05, + "loss": 0.1929, + "step": 10602 + }, + { + "epoch": 0.169648, + "grad_norm": 0.75, + "learning_rate": 8.370645161290323e-05, + "loss": 0.1824, + "step": 10603 + }, + { + "epoch": 0.169664, + "grad_norm": 0.78515625, + "learning_rate": 8.370483870967743e-05, + "loss": 0.1742, + "step": 10604 + }, + { + "epoch": 0.16968, + "grad_norm": 0.65625, + "learning_rate": 8.370322580645162e-05, + "loss": 0.1874, + "step": 10605 + }, + { + "epoch": 0.169696, + "grad_norm": 0.92578125, + "learning_rate": 8.370161290322582e-05, + "loss": 0.2146, + "step": 10606 + }, + { + "epoch": 0.169712, + "grad_norm": 0.57421875, + "learning_rate": 8.37e-05, + "loss": 0.1682, + "step": 10607 + }, + { + "epoch": 0.169728, + "grad_norm": 0.52734375, + "learning_rate": 8.36983870967742e-05, + "loss": 0.1566, + "step": 10608 + }, + { + "epoch": 0.169744, + "grad_norm": 0.66796875, + "learning_rate": 8.369677419354839e-05, + "loss": 0.1803, + "step": 10609 + }, + { + "epoch": 0.16976, + "grad_norm": 0.6328125, + "learning_rate": 8.369516129032259e-05, + "loss": 0.1735, + "step": 10610 + }, + { + "epoch": 0.169776, + "grad_norm": 0.546875, + "learning_rate": 8.369354838709677e-05, + "loss": 0.1667, + "step": 10611 + }, + { + "epoch": 0.169792, + "grad_norm": 0.7109375, + "learning_rate": 8.369193548387097e-05, + "loss": 0.1857, + "step": 10612 + }, + { + "epoch": 0.169808, + "grad_norm": 0.6484375, + "learning_rate": 8.369032258064516e-05, + "loss": 0.1873, + "step": 10613 + }, + { + "epoch": 0.169824, + "grad_norm": 1.1015625, + "learning_rate": 8.368870967741936e-05, + "loss": 0.168, + "step": 10614 + }, + { + "epoch": 0.16984, + "grad_norm": 0.640625, + "learning_rate": 8.368709677419356e-05, + "loss": 0.1609, + "step": 10615 + }, + { + "epoch": 0.169856, + "grad_norm": 0.59765625, + "learning_rate": 8.368548387096776e-05, + "loss": 0.1304, + "step": 10616 + }, + { + "epoch": 0.169872, + "grad_norm": 0.482421875, + "learning_rate": 8.368387096774194e-05, + "loss": 0.1653, + "step": 10617 + }, + { + "epoch": 0.169888, + "grad_norm": 0.8359375, + "learning_rate": 8.368225806451613e-05, + "loss": 0.1131, + "step": 10618 + }, + { + "epoch": 0.169904, + "grad_norm": 0.87109375, + "learning_rate": 8.368064516129033e-05, + "loss": 0.1775, + "step": 10619 + }, + { + "epoch": 0.16992, + "grad_norm": 1.1328125, + "learning_rate": 8.367903225806452e-05, + "loss": 0.1556, + "step": 10620 + }, + { + "epoch": 0.169936, + "grad_norm": 0.87109375, + "learning_rate": 8.367741935483872e-05, + "loss": 0.1683, + "step": 10621 + }, + { + "epoch": 0.169952, + "grad_norm": 0.671875, + "learning_rate": 8.36758064516129e-05, + "loss": 0.1539, + "step": 10622 + }, + { + "epoch": 0.169968, + "grad_norm": 0.890625, + "learning_rate": 8.36741935483871e-05, + "loss": 0.1967, + "step": 10623 + }, + { + "epoch": 0.169984, + "grad_norm": 0.63671875, + "learning_rate": 8.367258064516129e-05, + "loss": 0.1984, + "step": 10624 + }, + { + "epoch": 0.17, + "grad_norm": 0.82421875, + "learning_rate": 8.367096774193549e-05, + "loss": 0.1903, + "step": 10625 + }, + { + "epoch": 0.170016, + "grad_norm": 0.79296875, + "learning_rate": 8.366935483870967e-05, + "loss": 0.1907, + "step": 10626 + }, + { + "epoch": 0.170032, + "grad_norm": 0.91015625, + "learning_rate": 8.366774193548387e-05, + "loss": 0.1885, + "step": 10627 + }, + { + "epoch": 0.170048, + "grad_norm": 0.8984375, + "learning_rate": 8.366612903225807e-05, + "loss": 0.1763, + "step": 10628 + }, + { + "epoch": 0.170064, + "grad_norm": 0.7109375, + "learning_rate": 8.366451612903227e-05, + "loss": 0.1988, + "step": 10629 + }, + { + "epoch": 0.17008, + "grad_norm": 0.85546875, + "learning_rate": 8.366290322580646e-05, + "loss": 0.1883, + "step": 10630 + }, + { + "epoch": 0.170096, + "grad_norm": 0.7265625, + "learning_rate": 8.366129032258066e-05, + "loss": 0.1247, + "step": 10631 + }, + { + "epoch": 0.170112, + "grad_norm": 1.0703125, + "learning_rate": 8.365967741935484e-05, + "loss": 0.1452, + "step": 10632 + }, + { + "epoch": 0.170128, + "grad_norm": 0.65234375, + "learning_rate": 8.365806451612903e-05, + "loss": 0.195, + "step": 10633 + }, + { + "epoch": 0.170144, + "grad_norm": 0.7109375, + "learning_rate": 8.365645161290323e-05, + "loss": 0.1772, + "step": 10634 + }, + { + "epoch": 0.17016, + "grad_norm": 0.9375, + "learning_rate": 8.365483870967742e-05, + "loss": 0.1565, + "step": 10635 + }, + { + "epoch": 0.170176, + "grad_norm": 0.66796875, + "learning_rate": 8.365322580645161e-05, + "loss": 0.1528, + "step": 10636 + }, + { + "epoch": 0.170192, + "grad_norm": 0.61328125, + "learning_rate": 8.36516129032258e-05, + "loss": 0.1667, + "step": 10637 + }, + { + "epoch": 0.170208, + "grad_norm": 1.203125, + "learning_rate": 8.365e-05, + "loss": 0.2129, + "step": 10638 + }, + { + "epoch": 0.170224, + "grad_norm": 1.1875, + "learning_rate": 8.36483870967742e-05, + "loss": 0.1564, + "step": 10639 + }, + { + "epoch": 0.17024, + "grad_norm": 0.9921875, + "learning_rate": 8.36467741935484e-05, + "loss": 0.2154, + "step": 10640 + }, + { + "epoch": 0.170256, + "grad_norm": 0.71484375, + "learning_rate": 8.364516129032259e-05, + "loss": 0.1271, + "step": 10641 + }, + { + "epoch": 0.170272, + "grad_norm": 0.58984375, + "learning_rate": 8.364354838709679e-05, + "loss": 0.1695, + "step": 10642 + }, + { + "epoch": 0.170288, + "grad_norm": 1.3046875, + "learning_rate": 8.364193548387097e-05, + "loss": 0.1582, + "step": 10643 + }, + { + "epoch": 0.170304, + "grad_norm": 1.0859375, + "learning_rate": 8.364032258064517e-05, + "loss": 0.1859, + "step": 10644 + }, + { + "epoch": 0.17032, + "grad_norm": 0.73828125, + "learning_rate": 8.363870967741936e-05, + "loss": 0.1542, + "step": 10645 + }, + { + "epoch": 0.170336, + "grad_norm": 0.61328125, + "learning_rate": 8.363709677419356e-05, + "loss": 0.1568, + "step": 10646 + }, + { + "epoch": 0.170352, + "grad_norm": 0.703125, + "learning_rate": 8.363548387096774e-05, + "loss": 0.1564, + "step": 10647 + }, + { + "epoch": 0.170368, + "grad_norm": 0.7265625, + "learning_rate": 8.363387096774193e-05, + "loss": 0.1509, + "step": 10648 + }, + { + "epoch": 0.170384, + "grad_norm": 1.2265625, + "learning_rate": 8.363225806451613e-05, + "loss": 0.1886, + "step": 10649 + }, + { + "epoch": 0.1704, + "grad_norm": 1.0859375, + "learning_rate": 8.363064516129033e-05, + "loss": 0.1702, + "step": 10650 + }, + { + "epoch": 0.170416, + "grad_norm": 0.70703125, + "learning_rate": 8.362903225806453e-05, + "loss": 0.1364, + "step": 10651 + }, + { + "epoch": 0.170432, + "grad_norm": 0.56640625, + "learning_rate": 8.362741935483871e-05, + "loss": 0.1443, + "step": 10652 + }, + { + "epoch": 0.170448, + "grad_norm": 1.5546875, + "learning_rate": 8.362580645161291e-05, + "loss": 0.1608, + "step": 10653 + }, + { + "epoch": 0.170464, + "grad_norm": 0.6015625, + "learning_rate": 8.36241935483871e-05, + "loss": 0.1686, + "step": 10654 + }, + { + "epoch": 0.17048, + "grad_norm": 0.8203125, + "learning_rate": 8.36225806451613e-05, + "loss": 0.1971, + "step": 10655 + }, + { + "epoch": 0.170496, + "grad_norm": 1.1171875, + "learning_rate": 8.362096774193549e-05, + "loss": 0.1898, + "step": 10656 + }, + { + "epoch": 0.170512, + "grad_norm": 0.89453125, + "learning_rate": 8.361935483870968e-05, + "loss": 0.175, + "step": 10657 + }, + { + "epoch": 0.170528, + "grad_norm": 1.25, + "learning_rate": 8.361774193548387e-05, + "loss": 0.1764, + "step": 10658 + }, + { + "epoch": 0.170544, + "grad_norm": 0.83203125, + "learning_rate": 8.361612903225807e-05, + "loss": 0.1981, + "step": 10659 + }, + { + "epoch": 0.17056, + "grad_norm": 0.8828125, + "learning_rate": 8.361451612903226e-05, + "loss": 0.1799, + "step": 10660 + }, + { + "epoch": 0.170576, + "grad_norm": 0.86328125, + "learning_rate": 8.361290322580646e-05, + "loss": 0.1737, + "step": 10661 + }, + { + "epoch": 0.170592, + "grad_norm": 1.4375, + "learning_rate": 8.361129032258064e-05, + "loss": 0.1691, + "step": 10662 + }, + { + "epoch": 0.170608, + "grad_norm": 0.734375, + "learning_rate": 8.360967741935484e-05, + "loss": 0.1614, + "step": 10663 + }, + { + "epoch": 0.170624, + "grad_norm": 1.28125, + "learning_rate": 8.360806451612904e-05, + "loss": 0.1737, + "step": 10664 + }, + { + "epoch": 0.17064, + "grad_norm": 0.98046875, + "learning_rate": 8.360645161290323e-05, + "loss": 0.2298, + "step": 10665 + }, + { + "epoch": 0.170656, + "grad_norm": 0.58984375, + "learning_rate": 8.360483870967743e-05, + "loss": 0.1808, + "step": 10666 + }, + { + "epoch": 0.170672, + "grad_norm": 0.63671875, + "learning_rate": 8.360322580645161e-05, + "loss": 0.1628, + "step": 10667 + }, + { + "epoch": 0.170688, + "grad_norm": 0.7890625, + "learning_rate": 8.360161290322581e-05, + "loss": 0.2042, + "step": 10668 + }, + { + "epoch": 0.170704, + "grad_norm": 0.79296875, + "learning_rate": 8.36e-05, + "loss": 0.1678, + "step": 10669 + }, + { + "epoch": 0.17072, + "grad_norm": 1.171875, + "learning_rate": 8.35983870967742e-05, + "loss": 0.1855, + "step": 10670 + }, + { + "epoch": 0.170736, + "grad_norm": 0.76171875, + "learning_rate": 8.359677419354838e-05, + "loss": 0.1678, + "step": 10671 + }, + { + "epoch": 0.170752, + "grad_norm": 1.71875, + "learning_rate": 8.359516129032258e-05, + "loss": 0.1838, + "step": 10672 + }, + { + "epoch": 0.170768, + "grad_norm": 0.7265625, + "learning_rate": 8.359354838709677e-05, + "loss": 0.1385, + "step": 10673 + }, + { + "epoch": 0.170784, + "grad_norm": 0.6015625, + "learning_rate": 8.359193548387097e-05, + "loss": 0.1742, + "step": 10674 + }, + { + "epoch": 0.1708, + "grad_norm": 0.5625, + "learning_rate": 8.359032258064517e-05, + "loss": 0.1421, + "step": 10675 + }, + { + "epoch": 0.170816, + "grad_norm": 0.71484375, + "learning_rate": 8.358870967741937e-05, + "loss": 0.152, + "step": 10676 + }, + { + "epoch": 0.170832, + "grad_norm": 0.828125, + "learning_rate": 8.358709677419356e-05, + "loss": 0.1759, + "step": 10677 + }, + { + "epoch": 0.170848, + "grad_norm": 0.62109375, + "learning_rate": 8.358548387096776e-05, + "loss": 0.1669, + "step": 10678 + }, + { + "epoch": 0.170864, + "grad_norm": 0.6640625, + "learning_rate": 8.358387096774194e-05, + "loss": 0.216, + "step": 10679 + }, + { + "epoch": 0.17088, + "grad_norm": 1.0078125, + "learning_rate": 8.358225806451613e-05, + "loss": 0.1658, + "step": 10680 + }, + { + "epoch": 0.170896, + "grad_norm": 0.921875, + "learning_rate": 8.358064516129033e-05, + "loss": 0.1747, + "step": 10681 + }, + { + "epoch": 0.170912, + "grad_norm": 1.03125, + "learning_rate": 8.357903225806451e-05, + "loss": 0.2073, + "step": 10682 + }, + { + "epoch": 0.170928, + "grad_norm": 1.3671875, + "learning_rate": 8.357741935483871e-05, + "loss": 0.1968, + "step": 10683 + }, + { + "epoch": 0.170944, + "grad_norm": 1.25, + "learning_rate": 8.35758064516129e-05, + "loss": 0.1685, + "step": 10684 + }, + { + "epoch": 0.17096, + "grad_norm": 0.6328125, + "learning_rate": 8.35741935483871e-05, + "loss": 0.1618, + "step": 10685 + }, + { + "epoch": 0.170976, + "grad_norm": 0.76171875, + "learning_rate": 8.35725806451613e-05, + "loss": 0.1423, + "step": 10686 + }, + { + "epoch": 0.170992, + "grad_norm": 1.0703125, + "learning_rate": 8.357096774193548e-05, + "loss": 0.1647, + "step": 10687 + }, + { + "epoch": 0.171008, + "grad_norm": 0.546875, + "learning_rate": 8.356935483870968e-05, + "loss": 0.1629, + "step": 10688 + }, + { + "epoch": 0.171024, + "grad_norm": 0.77734375, + "learning_rate": 8.356774193548388e-05, + "loss": 0.1753, + "step": 10689 + }, + { + "epoch": 0.17104, + "grad_norm": 0.96875, + "learning_rate": 8.356612903225807e-05, + "loss": 0.1621, + "step": 10690 + }, + { + "epoch": 0.171056, + "grad_norm": 0.83203125, + "learning_rate": 8.356451612903227e-05, + "loss": 0.1503, + "step": 10691 + }, + { + "epoch": 0.171072, + "grad_norm": 0.6328125, + "learning_rate": 8.356290322580646e-05, + "loss": 0.1912, + "step": 10692 + }, + { + "epoch": 0.171088, + "grad_norm": 1.1796875, + "learning_rate": 8.356129032258065e-05, + "loss": 0.1848, + "step": 10693 + }, + { + "epoch": 0.171104, + "grad_norm": 0.58984375, + "learning_rate": 8.355967741935484e-05, + "loss": 0.1968, + "step": 10694 + }, + { + "epoch": 0.17112, + "grad_norm": 0.921875, + "learning_rate": 8.355806451612903e-05, + "loss": 0.1532, + "step": 10695 + }, + { + "epoch": 0.171136, + "grad_norm": 0.5390625, + "learning_rate": 8.355645161290323e-05, + "loss": 0.1496, + "step": 10696 + }, + { + "epoch": 0.171152, + "grad_norm": 1.0859375, + "learning_rate": 8.355483870967741e-05, + "loss": 0.1959, + "step": 10697 + }, + { + "epoch": 0.171168, + "grad_norm": 0.75390625, + "learning_rate": 8.355322580645161e-05, + "loss": 0.1709, + "step": 10698 + }, + { + "epoch": 0.171184, + "grad_norm": 0.66796875, + "learning_rate": 8.355161290322581e-05, + "loss": 0.1282, + "step": 10699 + }, + { + "epoch": 0.1712, + "grad_norm": 0.73046875, + "learning_rate": 8.355000000000001e-05, + "loss": 0.1739, + "step": 10700 + }, + { + "epoch": 0.171216, + "grad_norm": 0.83203125, + "learning_rate": 8.35483870967742e-05, + "loss": 0.1291, + "step": 10701 + }, + { + "epoch": 0.171232, + "grad_norm": 1.140625, + "learning_rate": 8.35467741935484e-05, + "loss": 0.1737, + "step": 10702 + }, + { + "epoch": 0.171248, + "grad_norm": 0.97265625, + "learning_rate": 8.354516129032258e-05, + "loss": 0.2048, + "step": 10703 + }, + { + "epoch": 0.171264, + "grad_norm": 0.6796875, + "learning_rate": 8.354354838709678e-05, + "loss": 0.1995, + "step": 10704 + }, + { + "epoch": 0.17128, + "grad_norm": 0.90625, + "learning_rate": 8.354193548387097e-05, + "loss": 0.1261, + "step": 10705 + }, + { + "epoch": 0.171296, + "grad_norm": 0.671875, + "learning_rate": 8.354032258064517e-05, + "loss": 0.1607, + "step": 10706 + }, + { + "epoch": 0.171312, + "grad_norm": 0.921875, + "learning_rate": 8.353870967741935e-05, + "loss": 0.1814, + "step": 10707 + }, + { + "epoch": 0.171328, + "grad_norm": 0.70703125, + "learning_rate": 8.353709677419355e-05, + "loss": 0.1863, + "step": 10708 + }, + { + "epoch": 0.171344, + "grad_norm": 0.5859375, + "learning_rate": 8.353548387096774e-05, + "loss": 0.1558, + "step": 10709 + }, + { + "epoch": 0.17136, + "grad_norm": 1.3203125, + "learning_rate": 8.353387096774194e-05, + "loss": 0.1777, + "step": 10710 + }, + { + "epoch": 0.171376, + "grad_norm": 0.7890625, + "learning_rate": 8.353225806451614e-05, + "loss": 0.1753, + "step": 10711 + }, + { + "epoch": 0.171392, + "grad_norm": 0.94140625, + "learning_rate": 8.353064516129033e-05, + "loss": 0.1998, + "step": 10712 + }, + { + "epoch": 0.171408, + "grad_norm": 1.015625, + "learning_rate": 8.352903225806453e-05, + "loss": 0.1761, + "step": 10713 + }, + { + "epoch": 0.171424, + "grad_norm": 1.2421875, + "learning_rate": 8.352741935483871e-05, + "loss": 0.1898, + "step": 10714 + }, + { + "epoch": 0.17144, + "grad_norm": 1.21875, + "learning_rate": 8.352580645161291e-05, + "loss": 0.1882, + "step": 10715 + }, + { + "epoch": 0.171456, + "grad_norm": 1.25, + "learning_rate": 8.35241935483871e-05, + "loss": 0.1929, + "step": 10716 + }, + { + "epoch": 0.171472, + "grad_norm": 1.3671875, + "learning_rate": 8.35225806451613e-05, + "loss": 0.1816, + "step": 10717 + }, + { + "epoch": 0.171488, + "grad_norm": 0.87890625, + "learning_rate": 8.352096774193548e-05, + "loss": 0.1597, + "step": 10718 + }, + { + "epoch": 0.171504, + "grad_norm": 0.55078125, + "learning_rate": 8.351935483870968e-05, + "loss": 0.1601, + "step": 10719 + }, + { + "epoch": 0.17152, + "grad_norm": 0.66015625, + "learning_rate": 8.351774193548387e-05, + "loss": 0.1679, + "step": 10720 + }, + { + "epoch": 0.171536, + "grad_norm": 0.7734375, + "learning_rate": 8.351612903225807e-05, + "loss": 0.1546, + "step": 10721 + }, + { + "epoch": 0.171552, + "grad_norm": 0.609375, + "learning_rate": 8.351451612903225e-05, + "loss": 0.2147, + "step": 10722 + }, + { + "epoch": 0.171568, + "grad_norm": 1.0078125, + "learning_rate": 8.351290322580645e-05, + "loss": 0.2524, + "step": 10723 + }, + { + "epoch": 0.171584, + "grad_norm": 1.0703125, + "learning_rate": 8.351129032258065e-05, + "loss": 0.1365, + "step": 10724 + }, + { + "epoch": 0.1716, + "grad_norm": 0.65625, + "learning_rate": 8.350967741935485e-05, + "loss": 0.1485, + "step": 10725 + }, + { + "epoch": 0.171616, + "grad_norm": 0.80859375, + "learning_rate": 8.350806451612904e-05, + "loss": 0.2016, + "step": 10726 + }, + { + "epoch": 0.171632, + "grad_norm": 0.82421875, + "learning_rate": 8.350645161290323e-05, + "loss": 0.1516, + "step": 10727 + }, + { + "epoch": 0.171648, + "grad_norm": 0.86328125, + "learning_rate": 8.350483870967742e-05, + "loss": 0.1887, + "step": 10728 + }, + { + "epoch": 0.171664, + "grad_norm": 0.7578125, + "learning_rate": 8.350322580645161e-05, + "loss": 0.1635, + "step": 10729 + }, + { + "epoch": 0.17168, + "grad_norm": 0.87890625, + "learning_rate": 8.350161290322581e-05, + "loss": 0.1442, + "step": 10730 + }, + { + "epoch": 0.171696, + "grad_norm": 0.74609375, + "learning_rate": 8.35e-05, + "loss": 0.1817, + "step": 10731 + }, + { + "epoch": 0.171712, + "grad_norm": 0.73046875, + "learning_rate": 8.34983870967742e-05, + "loss": 0.1508, + "step": 10732 + }, + { + "epoch": 0.171728, + "grad_norm": 1.1796875, + "learning_rate": 8.349677419354838e-05, + "loss": 0.1992, + "step": 10733 + }, + { + "epoch": 0.171744, + "grad_norm": 0.765625, + "learning_rate": 8.349516129032258e-05, + "loss": 0.1947, + "step": 10734 + }, + { + "epoch": 0.17176, + "grad_norm": 0.9296875, + "learning_rate": 8.349354838709678e-05, + "loss": 0.1488, + "step": 10735 + }, + { + "epoch": 0.171776, + "grad_norm": 1.1328125, + "learning_rate": 8.349193548387098e-05, + "loss": 0.1838, + "step": 10736 + }, + { + "epoch": 0.171792, + "grad_norm": 0.91015625, + "learning_rate": 8.349032258064517e-05, + "loss": 0.1885, + "step": 10737 + }, + { + "epoch": 0.171808, + "grad_norm": 0.97265625, + "learning_rate": 8.348870967741937e-05, + "loss": 0.1667, + "step": 10738 + }, + { + "epoch": 0.171824, + "grad_norm": 0.498046875, + "learning_rate": 8.348709677419355e-05, + "loss": 0.1473, + "step": 10739 + }, + { + "epoch": 0.17184, + "grad_norm": 0.87890625, + "learning_rate": 8.348548387096775e-05, + "loss": 0.1919, + "step": 10740 + }, + { + "epoch": 0.171856, + "grad_norm": 0.64453125, + "learning_rate": 8.348387096774194e-05, + "loss": 0.1827, + "step": 10741 + }, + { + "epoch": 0.171872, + "grad_norm": 0.515625, + "learning_rate": 8.348225806451612e-05, + "loss": 0.1441, + "step": 10742 + }, + { + "epoch": 0.171888, + "grad_norm": 0.6015625, + "learning_rate": 8.348064516129032e-05, + "loss": 0.1869, + "step": 10743 + }, + { + "epoch": 0.171904, + "grad_norm": 0.9765625, + "learning_rate": 8.347903225806451e-05, + "loss": 0.2025, + "step": 10744 + }, + { + "epoch": 0.17192, + "grad_norm": 0.65234375, + "learning_rate": 8.347741935483871e-05, + "loss": 0.187, + "step": 10745 + }, + { + "epoch": 0.171936, + "grad_norm": 0.6796875, + "learning_rate": 8.347580645161291e-05, + "loss": 0.1933, + "step": 10746 + }, + { + "epoch": 0.171952, + "grad_norm": 1.0703125, + "learning_rate": 8.347419354838711e-05, + "loss": 0.1886, + "step": 10747 + }, + { + "epoch": 0.171968, + "grad_norm": 0.6640625, + "learning_rate": 8.34725806451613e-05, + "loss": 0.1631, + "step": 10748 + }, + { + "epoch": 0.171984, + "grad_norm": 0.85546875, + "learning_rate": 8.34709677419355e-05, + "loss": 0.1413, + "step": 10749 + }, + { + "epoch": 0.172, + "grad_norm": 0.87890625, + "learning_rate": 8.346935483870968e-05, + "loss": 0.1829, + "step": 10750 + }, + { + "epoch": 0.172016, + "grad_norm": 1.4375, + "learning_rate": 8.346774193548388e-05, + "loss": 0.1851, + "step": 10751 + }, + { + "epoch": 0.172032, + "grad_norm": 1.0546875, + "learning_rate": 8.346612903225807e-05, + "loss": 0.1616, + "step": 10752 + }, + { + "epoch": 0.172048, + "grad_norm": 0.7421875, + "learning_rate": 8.346451612903227e-05, + "loss": 0.2046, + "step": 10753 + }, + { + "epoch": 0.172064, + "grad_norm": 0.96875, + "learning_rate": 8.346290322580645e-05, + "loss": 0.1848, + "step": 10754 + }, + { + "epoch": 0.17208, + "grad_norm": 0.6875, + "learning_rate": 8.346129032258065e-05, + "loss": 0.1636, + "step": 10755 + }, + { + "epoch": 0.172096, + "grad_norm": 0.66796875, + "learning_rate": 8.345967741935484e-05, + "loss": 0.1676, + "step": 10756 + }, + { + "epoch": 0.172112, + "grad_norm": 0.80859375, + "learning_rate": 8.345806451612902e-05, + "loss": 0.1748, + "step": 10757 + }, + { + "epoch": 0.172128, + "grad_norm": 0.79296875, + "learning_rate": 8.345645161290322e-05, + "loss": 0.2084, + "step": 10758 + }, + { + "epoch": 0.172144, + "grad_norm": 0.828125, + "learning_rate": 8.345483870967742e-05, + "loss": 0.1526, + "step": 10759 + }, + { + "epoch": 0.17216, + "grad_norm": 0.80078125, + "learning_rate": 8.345322580645162e-05, + "loss": 0.1509, + "step": 10760 + }, + { + "epoch": 0.172176, + "grad_norm": 1.09375, + "learning_rate": 8.345161290322581e-05, + "loss": 0.1813, + "step": 10761 + }, + { + "epoch": 0.172192, + "grad_norm": 0.53515625, + "learning_rate": 8.345000000000001e-05, + "loss": 0.138, + "step": 10762 + }, + { + "epoch": 0.172208, + "grad_norm": 0.875, + "learning_rate": 8.34483870967742e-05, + "loss": 0.1662, + "step": 10763 + }, + { + "epoch": 0.172224, + "grad_norm": 0.515625, + "learning_rate": 8.34467741935484e-05, + "loss": 0.1371, + "step": 10764 + }, + { + "epoch": 0.17224, + "grad_norm": 0.71875, + "learning_rate": 8.344516129032258e-05, + "loss": 0.1515, + "step": 10765 + }, + { + "epoch": 0.172256, + "grad_norm": 0.71484375, + "learning_rate": 8.344354838709678e-05, + "loss": 0.1903, + "step": 10766 + }, + { + "epoch": 0.172272, + "grad_norm": 0.578125, + "learning_rate": 8.344193548387097e-05, + "loss": 0.1575, + "step": 10767 + }, + { + "epoch": 0.172288, + "grad_norm": 0.6796875, + "learning_rate": 8.344032258064517e-05, + "loss": 0.1938, + "step": 10768 + }, + { + "epoch": 0.172304, + "grad_norm": 0.52734375, + "learning_rate": 8.343870967741935e-05, + "loss": 0.157, + "step": 10769 + }, + { + "epoch": 0.17232, + "grad_norm": 0.84765625, + "learning_rate": 8.343709677419355e-05, + "loss": 0.1659, + "step": 10770 + }, + { + "epoch": 0.172336, + "grad_norm": 0.96875, + "learning_rate": 8.343548387096775e-05, + "loss": 0.1594, + "step": 10771 + }, + { + "epoch": 0.172352, + "grad_norm": 1.546875, + "learning_rate": 8.343387096774195e-05, + "loss": 0.2062, + "step": 10772 + }, + { + "epoch": 0.172368, + "grad_norm": 0.69921875, + "learning_rate": 8.343225806451614e-05, + "loss": 0.1351, + "step": 10773 + }, + { + "epoch": 0.172384, + "grad_norm": 0.75390625, + "learning_rate": 8.343064516129032e-05, + "loss": 0.1855, + "step": 10774 + }, + { + "epoch": 0.1724, + "grad_norm": 0.640625, + "learning_rate": 8.342903225806452e-05, + "loss": 0.159, + "step": 10775 + }, + { + "epoch": 0.172416, + "grad_norm": 0.84765625, + "learning_rate": 8.342741935483871e-05, + "loss": 0.182, + "step": 10776 + }, + { + "epoch": 0.172432, + "grad_norm": 0.953125, + "learning_rate": 8.342580645161291e-05, + "loss": 0.194, + "step": 10777 + }, + { + "epoch": 0.172448, + "grad_norm": 1.0546875, + "learning_rate": 8.34241935483871e-05, + "loss": 0.1496, + "step": 10778 + }, + { + "epoch": 0.172464, + "grad_norm": 1.0546875, + "learning_rate": 8.34225806451613e-05, + "loss": 0.2232, + "step": 10779 + }, + { + "epoch": 0.17248, + "grad_norm": 1.1875, + "learning_rate": 8.342096774193548e-05, + "loss": 0.1964, + "step": 10780 + }, + { + "epoch": 0.172496, + "grad_norm": 0.58984375, + "learning_rate": 8.341935483870968e-05, + "loss": 0.1559, + "step": 10781 + }, + { + "epoch": 0.172512, + "grad_norm": 0.8203125, + "learning_rate": 8.341774193548388e-05, + "loss": 0.1689, + "step": 10782 + }, + { + "epoch": 0.172528, + "grad_norm": 0.69921875, + "learning_rate": 8.341612903225807e-05, + "loss": 0.1986, + "step": 10783 + }, + { + "epoch": 0.172544, + "grad_norm": 0.80859375, + "learning_rate": 8.341451612903227e-05, + "loss": 0.1936, + "step": 10784 + }, + { + "epoch": 0.17256, + "grad_norm": 0.6796875, + "learning_rate": 8.341290322580647e-05, + "loss": 0.1602, + "step": 10785 + }, + { + "epoch": 0.172576, + "grad_norm": 1.265625, + "learning_rate": 8.341129032258065e-05, + "loss": 0.1603, + "step": 10786 + }, + { + "epoch": 0.172592, + "grad_norm": 0.62890625, + "learning_rate": 8.340967741935485e-05, + "loss": 0.1623, + "step": 10787 + }, + { + "epoch": 0.172608, + "grad_norm": 0.8671875, + "learning_rate": 8.340806451612904e-05, + "loss": 0.1717, + "step": 10788 + }, + { + "epoch": 0.172624, + "grad_norm": 0.69140625, + "learning_rate": 8.340645161290322e-05, + "loss": 0.1711, + "step": 10789 + }, + { + "epoch": 0.17264, + "grad_norm": 0.65625, + "learning_rate": 8.340483870967742e-05, + "loss": 0.161, + "step": 10790 + }, + { + "epoch": 0.172656, + "grad_norm": 0.7265625, + "learning_rate": 8.340322580645161e-05, + "loss": 0.1728, + "step": 10791 + }, + { + "epoch": 0.172672, + "grad_norm": 0.83203125, + "learning_rate": 8.340161290322581e-05, + "loss": 0.1595, + "step": 10792 + }, + { + "epoch": 0.172688, + "grad_norm": 0.73828125, + "learning_rate": 8.34e-05, + "loss": 0.1695, + "step": 10793 + }, + { + "epoch": 0.172704, + "grad_norm": 0.82421875, + "learning_rate": 8.33983870967742e-05, + "loss": 0.1826, + "step": 10794 + }, + { + "epoch": 0.17272, + "grad_norm": 1.125, + "learning_rate": 8.33967741935484e-05, + "loss": 0.1837, + "step": 10795 + }, + { + "epoch": 0.172736, + "grad_norm": 0.73828125, + "learning_rate": 8.339516129032259e-05, + "loss": 0.1586, + "step": 10796 + }, + { + "epoch": 0.172752, + "grad_norm": 0.55078125, + "learning_rate": 8.339354838709678e-05, + "loss": 0.1699, + "step": 10797 + }, + { + "epoch": 0.172768, + "grad_norm": 1.3359375, + "learning_rate": 8.339193548387098e-05, + "loss": 0.1844, + "step": 10798 + }, + { + "epoch": 0.172784, + "grad_norm": 0.59375, + "learning_rate": 8.339032258064516e-05, + "loss": 0.1582, + "step": 10799 + }, + { + "epoch": 0.1728, + "grad_norm": 0.6328125, + "learning_rate": 8.338870967741936e-05, + "loss": 0.1668, + "step": 10800 + }, + { + "epoch": 0.172816, + "grad_norm": 0.6953125, + "learning_rate": 8.338709677419355e-05, + "loss": 0.1743, + "step": 10801 + }, + { + "epoch": 0.172832, + "grad_norm": 0.875, + "learning_rate": 8.338548387096775e-05, + "loss": 0.1629, + "step": 10802 + }, + { + "epoch": 0.172848, + "grad_norm": 0.59765625, + "learning_rate": 8.338387096774194e-05, + "loss": 0.1641, + "step": 10803 + }, + { + "epoch": 0.172864, + "grad_norm": 1.234375, + "learning_rate": 8.338225806451612e-05, + "loss": 0.1891, + "step": 10804 + }, + { + "epoch": 0.17288, + "grad_norm": 0.70703125, + "learning_rate": 8.338064516129032e-05, + "loss": 0.1829, + "step": 10805 + }, + { + "epoch": 0.172896, + "grad_norm": 0.7265625, + "learning_rate": 8.337903225806452e-05, + "loss": 0.1508, + "step": 10806 + }, + { + "epoch": 0.172912, + "grad_norm": 1.046875, + "learning_rate": 8.337741935483872e-05, + "loss": 0.1857, + "step": 10807 + }, + { + "epoch": 0.172928, + "grad_norm": 0.7265625, + "learning_rate": 8.337580645161291e-05, + "loss": 0.1383, + "step": 10808 + }, + { + "epoch": 0.172944, + "grad_norm": 0.8828125, + "learning_rate": 8.337419354838711e-05, + "loss": 0.1839, + "step": 10809 + }, + { + "epoch": 0.17296, + "grad_norm": 0.640625, + "learning_rate": 8.337258064516129e-05, + "loss": 0.1959, + "step": 10810 + }, + { + "epoch": 0.172976, + "grad_norm": 0.69140625, + "learning_rate": 8.337096774193549e-05, + "loss": 0.1586, + "step": 10811 + }, + { + "epoch": 0.172992, + "grad_norm": 0.68359375, + "learning_rate": 8.336935483870968e-05, + "loss": 0.1669, + "step": 10812 + }, + { + "epoch": 0.173008, + "grad_norm": 0.73046875, + "learning_rate": 8.336774193548388e-05, + "loss": 0.166, + "step": 10813 + }, + { + "epoch": 0.173024, + "grad_norm": 0.8125, + "learning_rate": 8.336612903225806e-05, + "loss": 0.1279, + "step": 10814 + }, + { + "epoch": 0.17304, + "grad_norm": 1.2890625, + "learning_rate": 8.336451612903226e-05, + "loss": 0.1783, + "step": 10815 + }, + { + "epoch": 0.173056, + "grad_norm": 1.015625, + "learning_rate": 8.336290322580645e-05, + "loss": 0.1938, + "step": 10816 + }, + { + "epoch": 0.173072, + "grad_norm": 0.62890625, + "learning_rate": 8.336129032258065e-05, + "loss": 0.1601, + "step": 10817 + }, + { + "epoch": 0.173088, + "grad_norm": 1.4140625, + "learning_rate": 8.335967741935484e-05, + "loss": 0.1819, + "step": 10818 + }, + { + "epoch": 0.173104, + "grad_norm": 0.88671875, + "learning_rate": 8.335806451612904e-05, + "loss": 0.1609, + "step": 10819 + }, + { + "epoch": 0.17312, + "grad_norm": 0.84765625, + "learning_rate": 8.335645161290324e-05, + "loss": 0.1788, + "step": 10820 + }, + { + "epoch": 0.173136, + "grad_norm": 0.5859375, + "learning_rate": 8.335483870967742e-05, + "loss": 0.1501, + "step": 10821 + }, + { + "epoch": 0.173152, + "grad_norm": 0.97265625, + "learning_rate": 8.335322580645162e-05, + "loss": 0.1769, + "step": 10822 + }, + { + "epoch": 0.173168, + "grad_norm": 0.6875, + "learning_rate": 8.335161290322581e-05, + "loss": 0.1721, + "step": 10823 + }, + { + "epoch": 0.173184, + "grad_norm": 0.71875, + "learning_rate": 8.335e-05, + "loss": 0.1772, + "step": 10824 + }, + { + "epoch": 0.1732, + "grad_norm": 0.515625, + "learning_rate": 8.334838709677419e-05, + "loss": 0.1823, + "step": 10825 + }, + { + "epoch": 0.173216, + "grad_norm": 0.65625, + "learning_rate": 8.334677419354839e-05, + "loss": 0.153, + "step": 10826 + }, + { + "epoch": 0.173232, + "grad_norm": 0.64453125, + "learning_rate": 8.334516129032258e-05, + "loss": 0.1663, + "step": 10827 + }, + { + "epoch": 0.173248, + "grad_norm": 0.52734375, + "learning_rate": 8.334354838709678e-05, + "loss": 0.1416, + "step": 10828 + }, + { + "epoch": 0.173264, + "grad_norm": 1.09375, + "learning_rate": 8.334193548387096e-05, + "loss": 0.159, + "step": 10829 + }, + { + "epoch": 0.17328, + "grad_norm": 0.90625, + "learning_rate": 8.334032258064516e-05, + "loss": 0.1786, + "step": 10830 + }, + { + "epoch": 0.173296, + "grad_norm": 1.2421875, + "learning_rate": 8.333870967741936e-05, + "loss": 0.1761, + "step": 10831 + }, + { + "epoch": 0.173312, + "grad_norm": 1.0390625, + "learning_rate": 8.333709677419356e-05, + "loss": 0.1854, + "step": 10832 + }, + { + "epoch": 0.173328, + "grad_norm": 0.69140625, + "learning_rate": 8.333548387096775e-05, + "loss": 0.1799, + "step": 10833 + }, + { + "epoch": 0.173344, + "grad_norm": 0.94921875, + "learning_rate": 8.333387096774195e-05, + "loss": 0.1664, + "step": 10834 + }, + { + "epoch": 0.17336, + "grad_norm": 1.609375, + "learning_rate": 8.333225806451613e-05, + "loss": 0.2205, + "step": 10835 + }, + { + "epoch": 0.173376, + "grad_norm": 0.53125, + "learning_rate": 8.333064516129032e-05, + "loss": 0.1741, + "step": 10836 + }, + { + "epoch": 0.173392, + "grad_norm": 0.87890625, + "learning_rate": 8.332903225806452e-05, + "loss": 0.1999, + "step": 10837 + }, + { + "epoch": 0.173408, + "grad_norm": 0.51171875, + "learning_rate": 8.33274193548387e-05, + "loss": 0.1665, + "step": 10838 + }, + { + "epoch": 0.173424, + "grad_norm": 0.80859375, + "learning_rate": 8.33258064516129e-05, + "loss": 0.2031, + "step": 10839 + }, + { + "epoch": 0.17344, + "grad_norm": 0.80078125, + "learning_rate": 8.332419354838709e-05, + "loss": 0.1868, + "step": 10840 + }, + { + "epoch": 0.173456, + "grad_norm": 0.6484375, + "learning_rate": 8.332258064516129e-05, + "loss": 0.1523, + "step": 10841 + }, + { + "epoch": 0.173472, + "grad_norm": 1.0859375, + "learning_rate": 8.332096774193549e-05, + "loss": 0.1797, + "step": 10842 + }, + { + "epoch": 0.173488, + "grad_norm": 0.97265625, + "learning_rate": 8.331935483870969e-05, + "loss": 0.207, + "step": 10843 + }, + { + "epoch": 0.173504, + "grad_norm": 0.7265625, + "learning_rate": 8.331774193548388e-05, + "loss": 0.1473, + "step": 10844 + }, + { + "epoch": 0.17352, + "grad_norm": 1.03125, + "learning_rate": 8.331612903225808e-05, + "loss": 0.1672, + "step": 10845 + }, + { + "epoch": 0.173536, + "grad_norm": 0.5546875, + "learning_rate": 8.331451612903226e-05, + "loss": 0.1735, + "step": 10846 + }, + { + "epoch": 0.173552, + "grad_norm": 0.546875, + "learning_rate": 8.331290322580646e-05, + "loss": 0.1823, + "step": 10847 + }, + { + "epoch": 0.173568, + "grad_norm": 0.5859375, + "learning_rate": 8.331129032258065e-05, + "loss": 0.1572, + "step": 10848 + }, + { + "epoch": 0.173584, + "grad_norm": 0.87109375, + "learning_rate": 8.330967741935485e-05, + "loss": 0.1406, + "step": 10849 + }, + { + "epoch": 0.1736, + "grad_norm": 0.55859375, + "learning_rate": 8.330806451612903e-05, + "loss": 0.1578, + "step": 10850 + }, + { + "epoch": 0.173616, + "grad_norm": 0.81640625, + "learning_rate": 8.330645161290322e-05, + "loss": 0.1779, + "step": 10851 + }, + { + "epoch": 0.173632, + "grad_norm": 0.59765625, + "learning_rate": 8.330483870967742e-05, + "loss": 0.1376, + "step": 10852 + }, + { + "epoch": 0.173648, + "grad_norm": 0.9140625, + "learning_rate": 8.33032258064516e-05, + "loss": 0.2204, + "step": 10853 + }, + { + "epoch": 0.173664, + "grad_norm": 0.5390625, + "learning_rate": 8.33016129032258e-05, + "loss": 0.1347, + "step": 10854 + }, + { + "epoch": 0.17368, + "grad_norm": 1.203125, + "learning_rate": 8.33e-05, + "loss": 0.1947, + "step": 10855 + }, + { + "epoch": 0.173696, + "grad_norm": 1.046875, + "learning_rate": 8.32983870967742e-05, + "loss": 0.1469, + "step": 10856 + }, + { + "epoch": 0.173712, + "grad_norm": 0.6640625, + "learning_rate": 8.329677419354839e-05, + "loss": 0.1429, + "step": 10857 + }, + { + "epoch": 0.173728, + "grad_norm": 1.015625, + "learning_rate": 8.329516129032259e-05, + "loss": 0.183, + "step": 10858 + }, + { + "epoch": 0.173744, + "grad_norm": 0.78125, + "learning_rate": 8.329354838709678e-05, + "loss": 0.1667, + "step": 10859 + }, + { + "epoch": 0.17376, + "grad_norm": 0.99609375, + "learning_rate": 8.329193548387098e-05, + "loss": 0.1846, + "step": 10860 + }, + { + "epoch": 0.173776, + "grad_norm": 0.72265625, + "learning_rate": 8.329032258064516e-05, + "loss": 0.1897, + "step": 10861 + }, + { + "epoch": 0.173792, + "grad_norm": 1.0546875, + "learning_rate": 8.328870967741936e-05, + "loss": 0.1493, + "step": 10862 + }, + { + "epoch": 0.173808, + "grad_norm": 0.7578125, + "learning_rate": 8.328709677419355e-05, + "loss": 0.2025, + "step": 10863 + }, + { + "epoch": 0.173824, + "grad_norm": 0.66015625, + "learning_rate": 8.328548387096775e-05, + "loss": 0.1373, + "step": 10864 + }, + { + "epoch": 0.17384, + "grad_norm": 1.34375, + "learning_rate": 8.328387096774193e-05, + "loss": 0.1949, + "step": 10865 + }, + { + "epoch": 0.173856, + "grad_norm": 0.63671875, + "learning_rate": 8.328225806451613e-05, + "loss": 0.1515, + "step": 10866 + }, + { + "epoch": 0.173872, + "grad_norm": 0.7109375, + "learning_rate": 8.328064516129033e-05, + "loss": 0.1733, + "step": 10867 + }, + { + "epoch": 0.173888, + "grad_norm": 0.68359375, + "learning_rate": 8.327903225806452e-05, + "loss": 0.1693, + "step": 10868 + }, + { + "epoch": 0.173904, + "grad_norm": 0.90234375, + "learning_rate": 8.327741935483872e-05, + "loss": 0.1535, + "step": 10869 + }, + { + "epoch": 0.17392, + "grad_norm": 0.6796875, + "learning_rate": 8.32758064516129e-05, + "loss": 0.1519, + "step": 10870 + }, + { + "epoch": 0.173936, + "grad_norm": 0.9375, + "learning_rate": 8.32741935483871e-05, + "loss": 0.1821, + "step": 10871 + }, + { + "epoch": 0.173952, + "grad_norm": 0.97265625, + "learning_rate": 8.327258064516129e-05, + "loss": 0.1837, + "step": 10872 + }, + { + "epoch": 0.173968, + "grad_norm": 0.5625, + "learning_rate": 8.327096774193549e-05, + "loss": 0.1658, + "step": 10873 + }, + { + "epoch": 0.173984, + "grad_norm": 0.7265625, + "learning_rate": 8.326935483870968e-05, + "loss": 0.1602, + "step": 10874 + }, + { + "epoch": 0.174, + "grad_norm": 0.98828125, + "learning_rate": 8.326774193548388e-05, + "loss": 0.2205, + "step": 10875 + }, + { + "epoch": 0.174016, + "grad_norm": 0.94140625, + "learning_rate": 8.326612903225806e-05, + "loss": 0.1806, + "step": 10876 + }, + { + "epoch": 0.174032, + "grad_norm": 0.8203125, + "learning_rate": 8.326451612903226e-05, + "loss": 0.1626, + "step": 10877 + }, + { + "epoch": 0.174048, + "grad_norm": 0.6484375, + "learning_rate": 8.326290322580645e-05, + "loss": 0.1395, + "step": 10878 + }, + { + "epoch": 0.174064, + "grad_norm": 0.54296875, + "learning_rate": 8.326129032258065e-05, + "loss": 0.1493, + "step": 10879 + }, + { + "epoch": 0.17408, + "grad_norm": 0.9375, + "learning_rate": 8.325967741935485e-05, + "loss": 0.2002, + "step": 10880 + }, + { + "epoch": 0.174096, + "grad_norm": 0.78515625, + "learning_rate": 8.325806451612905e-05, + "loss": 0.1222, + "step": 10881 + }, + { + "epoch": 0.174112, + "grad_norm": 1.125, + "learning_rate": 8.325645161290323e-05, + "loss": 0.1918, + "step": 10882 + }, + { + "epoch": 0.174128, + "grad_norm": 0.83203125, + "learning_rate": 8.325483870967742e-05, + "loss": 0.1962, + "step": 10883 + }, + { + "epoch": 0.174144, + "grad_norm": 0.765625, + "learning_rate": 8.325322580645162e-05, + "loss": 0.2016, + "step": 10884 + }, + { + "epoch": 0.17416, + "grad_norm": 0.9140625, + "learning_rate": 8.32516129032258e-05, + "loss": 0.1931, + "step": 10885 + }, + { + "epoch": 0.174176, + "grad_norm": 0.6015625, + "learning_rate": 8.325e-05, + "loss": 0.1606, + "step": 10886 + }, + { + "epoch": 0.174192, + "grad_norm": 0.63671875, + "learning_rate": 8.324838709677419e-05, + "loss": 0.1697, + "step": 10887 + }, + { + "epoch": 0.174208, + "grad_norm": 0.6875, + "learning_rate": 8.324677419354839e-05, + "loss": 0.1254, + "step": 10888 + }, + { + "epoch": 0.174224, + "grad_norm": 0.57421875, + "learning_rate": 8.324516129032258e-05, + "loss": 0.1442, + "step": 10889 + }, + { + "epoch": 0.17424, + "grad_norm": 0.6328125, + "learning_rate": 8.324354838709678e-05, + "loss": 0.1514, + "step": 10890 + }, + { + "epoch": 0.174256, + "grad_norm": 0.84765625, + "learning_rate": 8.324193548387098e-05, + "loss": 0.1867, + "step": 10891 + }, + { + "epoch": 0.174272, + "grad_norm": 0.70703125, + "learning_rate": 8.324032258064517e-05, + "loss": 0.1994, + "step": 10892 + }, + { + "epoch": 0.174288, + "grad_norm": 0.75390625, + "learning_rate": 8.323870967741936e-05, + "loss": 0.1903, + "step": 10893 + }, + { + "epoch": 0.174304, + "grad_norm": 0.828125, + "learning_rate": 8.323709677419356e-05, + "loss": 0.1214, + "step": 10894 + }, + { + "epoch": 0.17432, + "grad_norm": 1.0, + "learning_rate": 8.323548387096775e-05, + "loss": 0.1769, + "step": 10895 + }, + { + "epoch": 0.174336, + "grad_norm": 1.1875, + "learning_rate": 8.323387096774195e-05, + "loss": 0.1665, + "step": 10896 + }, + { + "epoch": 0.174352, + "grad_norm": 0.5859375, + "learning_rate": 8.323225806451613e-05, + "loss": 0.1722, + "step": 10897 + }, + { + "epoch": 0.174368, + "grad_norm": 1.140625, + "learning_rate": 8.323064516129032e-05, + "loss": 0.1866, + "step": 10898 + }, + { + "epoch": 0.174384, + "grad_norm": 0.7265625, + "learning_rate": 8.322903225806452e-05, + "loss": 0.1626, + "step": 10899 + }, + { + "epoch": 0.1744, + "grad_norm": 0.9140625, + "learning_rate": 8.32274193548387e-05, + "loss": 0.2086, + "step": 10900 + }, + { + "epoch": 0.174416, + "grad_norm": 0.8203125, + "learning_rate": 8.32258064516129e-05, + "loss": 0.1739, + "step": 10901 + }, + { + "epoch": 0.174432, + "grad_norm": 0.8046875, + "learning_rate": 8.32241935483871e-05, + "loss": 0.1587, + "step": 10902 + }, + { + "epoch": 0.174448, + "grad_norm": 0.8359375, + "learning_rate": 8.32225806451613e-05, + "loss": 0.1746, + "step": 10903 + }, + { + "epoch": 0.174464, + "grad_norm": 0.59375, + "learning_rate": 8.322096774193549e-05, + "loss": 0.1253, + "step": 10904 + }, + { + "epoch": 0.17448, + "grad_norm": 1.1875, + "learning_rate": 8.321935483870969e-05, + "loss": 0.2136, + "step": 10905 + }, + { + "epoch": 0.174496, + "grad_norm": 0.65625, + "learning_rate": 8.321774193548387e-05, + "loss": 0.1549, + "step": 10906 + }, + { + "epoch": 0.174512, + "grad_norm": 0.60546875, + "learning_rate": 8.321612903225807e-05, + "loss": 0.1756, + "step": 10907 + }, + { + "epoch": 0.174528, + "grad_norm": 0.76953125, + "learning_rate": 8.321451612903226e-05, + "loss": 0.158, + "step": 10908 + }, + { + "epoch": 0.174544, + "grad_norm": 0.5703125, + "learning_rate": 8.321290322580646e-05, + "loss": 0.1626, + "step": 10909 + }, + { + "epoch": 0.17456, + "grad_norm": 1.0546875, + "learning_rate": 8.321129032258065e-05, + "loss": 0.1557, + "step": 10910 + }, + { + "epoch": 0.174576, + "grad_norm": 1.3046875, + "learning_rate": 8.320967741935485e-05, + "loss": 0.2365, + "step": 10911 + }, + { + "epoch": 0.174592, + "grad_norm": 0.78515625, + "learning_rate": 8.320806451612903e-05, + "loss": 0.1556, + "step": 10912 + }, + { + "epoch": 0.174608, + "grad_norm": 0.62890625, + "learning_rate": 8.320645161290322e-05, + "loss": 0.1689, + "step": 10913 + }, + { + "epoch": 0.174624, + "grad_norm": 0.89453125, + "learning_rate": 8.320483870967742e-05, + "loss": 0.198, + "step": 10914 + }, + { + "epoch": 0.17464, + "grad_norm": 0.78515625, + "learning_rate": 8.320322580645162e-05, + "loss": 0.178, + "step": 10915 + }, + { + "epoch": 0.174656, + "grad_norm": 0.5234375, + "learning_rate": 8.320161290322582e-05, + "loss": 0.1324, + "step": 10916 + }, + { + "epoch": 0.174672, + "grad_norm": 1.1796875, + "learning_rate": 8.32e-05, + "loss": 0.1954, + "step": 10917 + }, + { + "epoch": 0.174688, + "grad_norm": 0.72265625, + "learning_rate": 8.31983870967742e-05, + "loss": 0.1616, + "step": 10918 + }, + { + "epoch": 0.174704, + "grad_norm": 1.375, + "learning_rate": 8.319677419354839e-05, + "loss": 0.2221, + "step": 10919 + }, + { + "epoch": 0.17472, + "grad_norm": 0.6484375, + "learning_rate": 8.319516129032259e-05, + "loss": 0.1928, + "step": 10920 + }, + { + "epoch": 0.174736, + "grad_norm": 0.703125, + "learning_rate": 8.319354838709677e-05, + "loss": 0.1707, + "step": 10921 + }, + { + "epoch": 0.174752, + "grad_norm": 0.87109375, + "learning_rate": 8.319193548387097e-05, + "loss": 0.1775, + "step": 10922 + }, + { + "epoch": 0.174768, + "grad_norm": 0.71875, + "learning_rate": 8.319032258064516e-05, + "loss": 0.1773, + "step": 10923 + }, + { + "epoch": 0.174784, + "grad_norm": 0.515625, + "learning_rate": 8.318870967741936e-05, + "loss": 0.1202, + "step": 10924 + }, + { + "epoch": 0.1748, + "grad_norm": 0.89453125, + "learning_rate": 8.318709677419355e-05, + "loss": 0.1888, + "step": 10925 + }, + { + "epoch": 0.174816, + "grad_norm": 0.53515625, + "learning_rate": 8.318548387096775e-05, + "loss": 0.1825, + "step": 10926 + }, + { + "epoch": 0.174832, + "grad_norm": 0.83984375, + "learning_rate": 8.318387096774195e-05, + "loss": 0.182, + "step": 10927 + }, + { + "epoch": 0.174848, + "grad_norm": 0.85546875, + "learning_rate": 8.318225806451613e-05, + "loss": 0.1741, + "step": 10928 + }, + { + "epoch": 0.174864, + "grad_norm": 1.0703125, + "learning_rate": 8.318064516129033e-05, + "loss": 0.1214, + "step": 10929 + }, + { + "epoch": 0.17488, + "grad_norm": 0.5390625, + "learning_rate": 8.317903225806452e-05, + "loss": 0.1584, + "step": 10930 + }, + { + "epoch": 0.174896, + "grad_norm": 0.8125, + "learning_rate": 8.317741935483872e-05, + "loss": 0.1778, + "step": 10931 + }, + { + "epoch": 0.174912, + "grad_norm": 0.703125, + "learning_rate": 8.31758064516129e-05, + "loss": 0.1514, + "step": 10932 + }, + { + "epoch": 0.174928, + "grad_norm": 0.92578125, + "learning_rate": 8.31741935483871e-05, + "loss": 0.1844, + "step": 10933 + }, + { + "epoch": 0.174944, + "grad_norm": 0.671875, + "learning_rate": 8.317258064516129e-05, + "loss": 0.1713, + "step": 10934 + }, + { + "epoch": 0.17496, + "grad_norm": 1.09375, + "learning_rate": 8.317096774193549e-05, + "loss": 0.1677, + "step": 10935 + }, + { + "epoch": 0.174976, + "grad_norm": 0.7265625, + "learning_rate": 8.316935483870967e-05, + "loss": 0.1857, + "step": 10936 + }, + { + "epoch": 0.174992, + "grad_norm": 1.359375, + "learning_rate": 8.316774193548387e-05, + "loss": 0.2426, + "step": 10937 + }, + { + "epoch": 0.175008, + "grad_norm": 0.80859375, + "learning_rate": 8.316612903225807e-05, + "loss": 0.1746, + "step": 10938 + }, + { + "epoch": 0.175024, + "grad_norm": 0.80078125, + "learning_rate": 8.316451612903226e-05, + "loss": 0.1934, + "step": 10939 + }, + { + "epoch": 0.17504, + "grad_norm": 0.640625, + "learning_rate": 8.316290322580646e-05, + "loss": 0.1566, + "step": 10940 + }, + { + "epoch": 0.175056, + "grad_norm": 0.8515625, + "learning_rate": 8.316129032258066e-05, + "loss": 0.1756, + "step": 10941 + }, + { + "epoch": 0.175072, + "grad_norm": 0.66796875, + "learning_rate": 8.315967741935484e-05, + "loss": 0.1674, + "step": 10942 + }, + { + "epoch": 0.175088, + "grad_norm": 0.703125, + "learning_rate": 8.315806451612904e-05, + "loss": 0.1351, + "step": 10943 + }, + { + "epoch": 0.175104, + "grad_norm": 0.58203125, + "learning_rate": 8.315645161290323e-05, + "loss": 0.1299, + "step": 10944 + }, + { + "epoch": 0.17512, + "grad_norm": 0.96484375, + "learning_rate": 8.315483870967742e-05, + "loss": 0.1771, + "step": 10945 + }, + { + "epoch": 0.175136, + "grad_norm": 0.8828125, + "learning_rate": 8.315322580645162e-05, + "loss": 0.1764, + "step": 10946 + }, + { + "epoch": 0.175152, + "grad_norm": 0.640625, + "learning_rate": 8.31516129032258e-05, + "loss": 0.1507, + "step": 10947 + }, + { + "epoch": 0.175168, + "grad_norm": 0.95703125, + "learning_rate": 8.315e-05, + "loss": 0.1767, + "step": 10948 + }, + { + "epoch": 0.175184, + "grad_norm": 0.640625, + "learning_rate": 8.314838709677419e-05, + "loss": 0.1566, + "step": 10949 + }, + { + "epoch": 0.1752, + "grad_norm": 1.265625, + "learning_rate": 8.314677419354839e-05, + "loss": 0.1973, + "step": 10950 + }, + { + "epoch": 0.175216, + "grad_norm": 0.734375, + "learning_rate": 8.314516129032259e-05, + "loss": 0.1529, + "step": 10951 + }, + { + "epoch": 0.175232, + "grad_norm": 0.578125, + "learning_rate": 8.314354838709679e-05, + "loss": 0.128, + "step": 10952 + }, + { + "epoch": 0.175248, + "grad_norm": 0.67578125, + "learning_rate": 8.314193548387097e-05, + "loss": 0.1486, + "step": 10953 + }, + { + "epoch": 0.175264, + "grad_norm": 0.67578125, + "learning_rate": 8.314032258064517e-05, + "loss": 0.1443, + "step": 10954 + }, + { + "epoch": 0.17528, + "grad_norm": 1.2890625, + "learning_rate": 8.313870967741936e-05, + "loss": 0.1861, + "step": 10955 + }, + { + "epoch": 0.175296, + "grad_norm": 0.828125, + "learning_rate": 8.313709677419356e-05, + "loss": 0.1916, + "step": 10956 + }, + { + "epoch": 0.175312, + "grad_norm": 0.859375, + "learning_rate": 8.313548387096774e-05, + "loss": 0.2246, + "step": 10957 + }, + { + "epoch": 0.175328, + "grad_norm": 0.84765625, + "learning_rate": 8.313387096774194e-05, + "loss": 0.2085, + "step": 10958 + }, + { + "epoch": 0.175344, + "grad_norm": 0.91015625, + "learning_rate": 8.313225806451613e-05, + "loss": 0.1835, + "step": 10959 + }, + { + "epoch": 0.17536, + "grad_norm": 1.265625, + "learning_rate": 8.313064516129032e-05, + "loss": 0.1976, + "step": 10960 + }, + { + "epoch": 0.175376, + "grad_norm": 1.0390625, + "learning_rate": 8.312903225806452e-05, + "loss": 0.1654, + "step": 10961 + }, + { + "epoch": 0.175392, + "grad_norm": 0.8125, + "learning_rate": 8.312741935483872e-05, + "loss": 0.2187, + "step": 10962 + }, + { + "epoch": 0.175408, + "grad_norm": 0.71484375, + "learning_rate": 8.312580645161291e-05, + "loss": 0.1876, + "step": 10963 + }, + { + "epoch": 0.175424, + "grad_norm": 0.8125, + "learning_rate": 8.31241935483871e-05, + "loss": 0.1577, + "step": 10964 + }, + { + "epoch": 0.17544, + "grad_norm": 0.76953125, + "learning_rate": 8.31225806451613e-05, + "loss": 0.1428, + "step": 10965 + }, + { + "epoch": 0.175456, + "grad_norm": 1.5390625, + "learning_rate": 8.312096774193549e-05, + "loss": 0.1878, + "step": 10966 + }, + { + "epoch": 0.175472, + "grad_norm": 0.96875, + "learning_rate": 8.311935483870969e-05, + "loss": 0.1616, + "step": 10967 + }, + { + "epoch": 0.175488, + "grad_norm": 0.703125, + "learning_rate": 8.311774193548387e-05, + "loss": 0.1926, + "step": 10968 + }, + { + "epoch": 0.175504, + "grad_norm": 0.57421875, + "learning_rate": 8.311612903225807e-05, + "loss": 0.1689, + "step": 10969 + }, + { + "epoch": 0.17552, + "grad_norm": 0.62109375, + "learning_rate": 8.311451612903226e-05, + "loss": 0.1712, + "step": 10970 + }, + { + "epoch": 0.175536, + "grad_norm": 1.0625, + "learning_rate": 8.311290322580646e-05, + "loss": 0.1878, + "step": 10971 + }, + { + "epoch": 0.175552, + "grad_norm": 0.4921875, + "learning_rate": 8.311129032258064e-05, + "loss": 0.1555, + "step": 10972 + }, + { + "epoch": 0.175568, + "grad_norm": 0.81640625, + "learning_rate": 8.310967741935484e-05, + "loss": 0.1504, + "step": 10973 + }, + { + "epoch": 0.175584, + "grad_norm": 1.9375, + "learning_rate": 8.310806451612903e-05, + "loss": 0.2228, + "step": 10974 + }, + { + "epoch": 0.1756, + "grad_norm": 1.5859375, + "learning_rate": 8.310645161290323e-05, + "loss": 0.1539, + "step": 10975 + }, + { + "epoch": 0.175616, + "grad_norm": 1.1796875, + "learning_rate": 8.310483870967743e-05, + "loss": 0.2139, + "step": 10976 + }, + { + "epoch": 0.175632, + "grad_norm": 0.859375, + "learning_rate": 8.310322580645161e-05, + "loss": 0.2088, + "step": 10977 + }, + { + "epoch": 0.175648, + "grad_norm": 0.62890625, + "learning_rate": 8.310161290322581e-05, + "loss": 0.1181, + "step": 10978 + }, + { + "epoch": 0.175664, + "grad_norm": 0.69140625, + "learning_rate": 8.31e-05, + "loss": 0.1723, + "step": 10979 + }, + { + "epoch": 0.17568, + "grad_norm": 0.69921875, + "learning_rate": 8.30983870967742e-05, + "loss": 0.195, + "step": 10980 + }, + { + "epoch": 0.175696, + "grad_norm": 0.55078125, + "learning_rate": 8.309677419354839e-05, + "loss": 0.1624, + "step": 10981 + }, + { + "epoch": 0.175712, + "grad_norm": 0.93359375, + "learning_rate": 8.309516129032259e-05, + "loss": 0.1839, + "step": 10982 + }, + { + "epoch": 0.175728, + "grad_norm": 0.60546875, + "learning_rate": 8.309354838709677e-05, + "loss": 0.1663, + "step": 10983 + }, + { + "epoch": 0.175744, + "grad_norm": 0.75390625, + "learning_rate": 8.309193548387097e-05, + "loss": 0.1979, + "step": 10984 + }, + { + "epoch": 0.17576, + "grad_norm": 0.76953125, + "learning_rate": 8.309032258064516e-05, + "loss": 0.1877, + "step": 10985 + }, + { + "epoch": 0.175776, + "grad_norm": 0.8359375, + "learning_rate": 8.308870967741936e-05, + "loss": 0.1745, + "step": 10986 + }, + { + "epoch": 0.175792, + "grad_norm": 0.78125, + "learning_rate": 8.308709677419356e-05, + "loss": 0.1756, + "step": 10987 + }, + { + "epoch": 0.175808, + "grad_norm": 0.7421875, + "learning_rate": 8.308548387096776e-05, + "loss": 0.1744, + "step": 10988 + }, + { + "epoch": 0.175824, + "grad_norm": 0.79296875, + "learning_rate": 8.308387096774194e-05, + "loss": 0.1795, + "step": 10989 + }, + { + "epoch": 0.17584, + "grad_norm": 0.86328125, + "learning_rate": 8.308225806451613e-05, + "loss": 0.1471, + "step": 10990 + }, + { + "epoch": 0.175856, + "grad_norm": 1.0, + "learning_rate": 8.308064516129033e-05, + "loss": 0.1636, + "step": 10991 + }, + { + "epoch": 0.175872, + "grad_norm": 0.70703125, + "learning_rate": 8.307903225806451e-05, + "loss": 0.1635, + "step": 10992 + }, + { + "epoch": 0.175888, + "grad_norm": 0.73046875, + "learning_rate": 8.307741935483871e-05, + "loss": 0.1431, + "step": 10993 + }, + { + "epoch": 0.175904, + "grad_norm": 0.82421875, + "learning_rate": 8.30758064516129e-05, + "loss": 0.1753, + "step": 10994 + }, + { + "epoch": 0.17592, + "grad_norm": 0.66015625, + "learning_rate": 8.30741935483871e-05, + "loss": 0.1646, + "step": 10995 + }, + { + "epoch": 0.175936, + "grad_norm": 1.0625, + "learning_rate": 8.307258064516129e-05, + "loss": 0.189, + "step": 10996 + }, + { + "epoch": 0.175952, + "grad_norm": 0.890625, + "learning_rate": 8.307096774193549e-05, + "loss": 0.1847, + "step": 10997 + }, + { + "epoch": 0.175968, + "grad_norm": 0.671875, + "learning_rate": 8.306935483870969e-05, + "loss": 0.1904, + "step": 10998 + }, + { + "epoch": 0.175984, + "grad_norm": 0.8125, + "learning_rate": 8.306774193548388e-05, + "loss": 0.1722, + "step": 10999 + }, + { + "epoch": 0.176, + "grad_norm": 0.7890625, + "learning_rate": 8.306612903225807e-05, + "loss": 0.1695, + "step": 11000 + }, + { + "epoch": 0.176016, + "grad_norm": 0.9921875, + "learning_rate": 8.306451612903227e-05, + "loss": 0.1797, + "step": 11001 + }, + { + "epoch": 0.176032, + "grad_norm": 1.0078125, + "learning_rate": 8.306290322580646e-05, + "loss": 0.1682, + "step": 11002 + }, + { + "epoch": 0.176048, + "grad_norm": 0.400390625, + "learning_rate": 8.306129032258066e-05, + "loss": 0.1287, + "step": 11003 + }, + { + "epoch": 0.176064, + "grad_norm": 0.734375, + "learning_rate": 8.305967741935484e-05, + "loss": 0.1563, + "step": 11004 + }, + { + "epoch": 0.17608, + "grad_norm": 0.54296875, + "learning_rate": 8.305806451612904e-05, + "loss": 0.1428, + "step": 11005 + }, + { + "epoch": 0.176096, + "grad_norm": 0.64453125, + "learning_rate": 8.305645161290323e-05, + "loss": 0.1839, + "step": 11006 + }, + { + "epoch": 0.176112, + "grad_norm": 0.765625, + "learning_rate": 8.305483870967741e-05, + "loss": 0.1624, + "step": 11007 + }, + { + "epoch": 0.176128, + "grad_norm": 0.91796875, + "learning_rate": 8.305322580645161e-05, + "loss": 0.1438, + "step": 11008 + }, + { + "epoch": 0.176144, + "grad_norm": 0.88671875, + "learning_rate": 8.30516129032258e-05, + "loss": 0.1482, + "step": 11009 + }, + { + "epoch": 0.17616, + "grad_norm": 0.8828125, + "learning_rate": 8.305e-05, + "loss": 0.1849, + "step": 11010 + }, + { + "epoch": 0.176176, + "grad_norm": 0.75390625, + "learning_rate": 8.30483870967742e-05, + "loss": 0.1674, + "step": 11011 + }, + { + "epoch": 0.176192, + "grad_norm": 0.84765625, + "learning_rate": 8.30467741935484e-05, + "loss": 0.1493, + "step": 11012 + }, + { + "epoch": 0.176208, + "grad_norm": 0.8984375, + "learning_rate": 8.304516129032258e-05, + "loss": 0.1669, + "step": 11013 + }, + { + "epoch": 0.176224, + "grad_norm": 0.57421875, + "learning_rate": 8.304354838709678e-05, + "loss": 0.1706, + "step": 11014 + }, + { + "epoch": 0.17624, + "grad_norm": 0.5234375, + "learning_rate": 8.304193548387097e-05, + "loss": 0.1736, + "step": 11015 + }, + { + "epoch": 0.176256, + "grad_norm": 0.73046875, + "learning_rate": 8.304032258064517e-05, + "loss": 0.1479, + "step": 11016 + }, + { + "epoch": 0.176272, + "grad_norm": 0.9140625, + "learning_rate": 8.303870967741936e-05, + "loss": 0.2007, + "step": 11017 + }, + { + "epoch": 0.176288, + "grad_norm": 0.671875, + "learning_rate": 8.303709677419356e-05, + "loss": 0.1604, + "step": 11018 + }, + { + "epoch": 0.176304, + "grad_norm": 0.64453125, + "learning_rate": 8.303548387096774e-05, + "loss": 0.2038, + "step": 11019 + }, + { + "epoch": 0.17632, + "grad_norm": 1.1015625, + "learning_rate": 8.303387096774194e-05, + "loss": 0.1933, + "step": 11020 + }, + { + "epoch": 0.176336, + "grad_norm": 0.76953125, + "learning_rate": 8.303225806451613e-05, + "loss": 0.191, + "step": 11021 + }, + { + "epoch": 0.176352, + "grad_norm": 0.5703125, + "learning_rate": 8.303064516129033e-05, + "loss": 0.1233, + "step": 11022 + }, + { + "epoch": 0.176368, + "grad_norm": 1.0546875, + "learning_rate": 8.302903225806453e-05, + "loss": 0.2136, + "step": 11023 + }, + { + "epoch": 0.176384, + "grad_norm": 0.78125, + "learning_rate": 8.302741935483871e-05, + "loss": 0.1392, + "step": 11024 + }, + { + "epoch": 0.1764, + "grad_norm": 1.03125, + "learning_rate": 8.302580645161291e-05, + "loss": 0.1882, + "step": 11025 + }, + { + "epoch": 0.176416, + "grad_norm": 0.63671875, + "learning_rate": 8.30241935483871e-05, + "loss": 0.1574, + "step": 11026 + }, + { + "epoch": 0.176432, + "grad_norm": 0.80078125, + "learning_rate": 8.30225806451613e-05, + "loss": 0.1887, + "step": 11027 + }, + { + "epoch": 0.176448, + "grad_norm": 0.82421875, + "learning_rate": 8.302096774193548e-05, + "loss": 0.1797, + "step": 11028 + }, + { + "epoch": 0.176464, + "grad_norm": 0.7421875, + "learning_rate": 8.301935483870968e-05, + "loss": 0.1939, + "step": 11029 + }, + { + "epoch": 0.17648, + "grad_norm": 0.8984375, + "learning_rate": 8.301774193548387e-05, + "loss": 0.1586, + "step": 11030 + }, + { + "epoch": 0.176496, + "grad_norm": 0.69921875, + "learning_rate": 8.301612903225807e-05, + "loss": 0.1359, + "step": 11031 + }, + { + "epoch": 0.176512, + "grad_norm": 0.87109375, + "learning_rate": 8.301451612903226e-05, + "loss": 0.1916, + "step": 11032 + }, + { + "epoch": 0.176528, + "grad_norm": 0.64453125, + "learning_rate": 8.301290322580646e-05, + "loss": 0.1484, + "step": 11033 + }, + { + "epoch": 0.176544, + "grad_norm": 0.8359375, + "learning_rate": 8.301129032258065e-05, + "loss": 0.1685, + "step": 11034 + }, + { + "epoch": 0.17656, + "grad_norm": 0.53515625, + "learning_rate": 8.300967741935484e-05, + "loss": 0.155, + "step": 11035 + }, + { + "epoch": 0.176576, + "grad_norm": 1.03125, + "learning_rate": 8.300806451612904e-05, + "loss": 0.1865, + "step": 11036 + }, + { + "epoch": 0.176592, + "grad_norm": 0.90625, + "learning_rate": 8.300645161290323e-05, + "loss": 0.1715, + "step": 11037 + }, + { + "epoch": 0.176608, + "grad_norm": 0.546875, + "learning_rate": 8.300483870967743e-05, + "loss": 0.1543, + "step": 11038 + }, + { + "epoch": 0.176624, + "grad_norm": 1.2890625, + "learning_rate": 8.300322580645161e-05, + "loss": 0.2415, + "step": 11039 + }, + { + "epoch": 0.17664, + "grad_norm": 1.21875, + "learning_rate": 8.300161290322581e-05, + "loss": 0.1866, + "step": 11040 + }, + { + "epoch": 0.176656, + "grad_norm": 0.89453125, + "learning_rate": 8.3e-05, + "loss": 0.1666, + "step": 11041 + }, + { + "epoch": 0.176672, + "grad_norm": 1.8203125, + "learning_rate": 8.29983870967742e-05, + "loss": 0.1549, + "step": 11042 + }, + { + "epoch": 0.176688, + "grad_norm": 1.2734375, + "learning_rate": 8.299677419354838e-05, + "loss": 0.1865, + "step": 11043 + }, + { + "epoch": 0.176704, + "grad_norm": 0.83984375, + "learning_rate": 8.299516129032258e-05, + "loss": 0.1948, + "step": 11044 + }, + { + "epoch": 0.17672, + "grad_norm": 1.21875, + "learning_rate": 8.299354838709677e-05, + "loss": 0.2148, + "step": 11045 + }, + { + "epoch": 0.176736, + "grad_norm": 0.859375, + "learning_rate": 8.299193548387097e-05, + "loss": 0.1957, + "step": 11046 + }, + { + "epoch": 0.176752, + "grad_norm": 0.875, + "learning_rate": 8.299032258064517e-05, + "loss": 0.1581, + "step": 11047 + }, + { + "epoch": 0.176768, + "grad_norm": 0.7890625, + "learning_rate": 8.298870967741937e-05, + "loss": 0.1816, + "step": 11048 + }, + { + "epoch": 0.176784, + "grad_norm": 0.5625, + "learning_rate": 8.298709677419355e-05, + "loss": 0.1445, + "step": 11049 + }, + { + "epoch": 0.1768, + "grad_norm": 1.078125, + "learning_rate": 8.298548387096775e-05, + "loss": 0.1659, + "step": 11050 + }, + { + "epoch": 0.176816, + "grad_norm": 0.578125, + "learning_rate": 8.298387096774194e-05, + "loss": 0.1629, + "step": 11051 + }, + { + "epoch": 0.176832, + "grad_norm": 1.5, + "learning_rate": 8.298225806451614e-05, + "loss": 0.2336, + "step": 11052 + }, + { + "epoch": 0.176848, + "grad_norm": 0.59375, + "learning_rate": 8.298064516129033e-05, + "loss": 0.1625, + "step": 11053 + }, + { + "epoch": 0.176864, + "grad_norm": 0.765625, + "learning_rate": 8.297903225806451e-05, + "loss": 0.1358, + "step": 11054 + }, + { + "epoch": 0.17688, + "grad_norm": 0.7890625, + "learning_rate": 8.297741935483871e-05, + "loss": 0.1768, + "step": 11055 + }, + { + "epoch": 0.176896, + "grad_norm": 0.6875, + "learning_rate": 8.29758064516129e-05, + "loss": 0.1917, + "step": 11056 + }, + { + "epoch": 0.176912, + "grad_norm": 0.55859375, + "learning_rate": 8.29741935483871e-05, + "loss": 0.1574, + "step": 11057 + }, + { + "epoch": 0.176928, + "grad_norm": 0.92578125, + "learning_rate": 8.29725806451613e-05, + "loss": 0.1741, + "step": 11058 + }, + { + "epoch": 0.176944, + "grad_norm": 0.62890625, + "learning_rate": 8.29709677419355e-05, + "loss": 0.1363, + "step": 11059 + }, + { + "epoch": 0.17696, + "grad_norm": 0.734375, + "learning_rate": 8.296935483870968e-05, + "loss": 0.17, + "step": 11060 + }, + { + "epoch": 0.176976, + "grad_norm": 0.65234375, + "learning_rate": 8.296774193548388e-05, + "loss": 0.1553, + "step": 11061 + }, + { + "epoch": 0.176992, + "grad_norm": 0.66796875, + "learning_rate": 8.296612903225807e-05, + "loss": 0.1515, + "step": 11062 + }, + { + "epoch": 0.177008, + "grad_norm": 0.8828125, + "learning_rate": 8.296451612903227e-05, + "loss": 0.1793, + "step": 11063 + }, + { + "epoch": 0.177024, + "grad_norm": 0.7578125, + "learning_rate": 8.296290322580645e-05, + "loss": 0.1364, + "step": 11064 + }, + { + "epoch": 0.17704, + "grad_norm": 0.6484375, + "learning_rate": 8.296129032258065e-05, + "loss": 0.1641, + "step": 11065 + }, + { + "epoch": 0.177056, + "grad_norm": 0.8984375, + "learning_rate": 8.295967741935484e-05, + "loss": 0.1726, + "step": 11066 + }, + { + "epoch": 0.177072, + "grad_norm": 0.75, + "learning_rate": 8.295806451612904e-05, + "loss": 0.178, + "step": 11067 + }, + { + "epoch": 0.177088, + "grad_norm": 0.60546875, + "learning_rate": 8.295645161290323e-05, + "loss": 0.1309, + "step": 11068 + }, + { + "epoch": 0.177104, + "grad_norm": 0.76953125, + "learning_rate": 8.295483870967741e-05, + "loss": 0.1651, + "step": 11069 + }, + { + "epoch": 0.17712, + "grad_norm": 0.87890625, + "learning_rate": 8.295322580645161e-05, + "loss": 0.1471, + "step": 11070 + }, + { + "epoch": 0.177136, + "grad_norm": 0.78515625, + "learning_rate": 8.295161290322581e-05, + "loss": 0.1622, + "step": 11071 + }, + { + "epoch": 0.177152, + "grad_norm": 1.0078125, + "learning_rate": 8.295000000000001e-05, + "loss": 0.1914, + "step": 11072 + }, + { + "epoch": 0.177168, + "grad_norm": 1.6328125, + "learning_rate": 8.29483870967742e-05, + "loss": 0.2082, + "step": 11073 + }, + { + "epoch": 0.177184, + "grad_norm": 0.61328125, + "learning_rate": 8.29467741935484e-05, + "loss": 0.1259, + "step": 11074 + }, + { + "epoch": 0.1772, + "grad_norm": 0.5703125, + "learning_rate": 8.294516129032258e-05, + "loss": 0.1467, + "step": 11075 + }, + { + "epoch": 0.177216, + "grad_norm": 0.88671875, + "learning_rate": 8.294354838709678e-05, + "loss": 0.207, + "step": 11076 + }, + { + "epoch": 0.177232, + "grad_norm": 0.7734375, + "learning_rate": 8.294193548387097e-05, + "loss": 0.1806, + "step": 11077 + }, + { + "epoch": 0.177248, + "grad_norm": 0.63671875, + "learning_rate": 8.294032258064517e-05, + "loss": 0.163, + "step": 11078 + }, + { + "epoch": 0.177264, + "grad_norm": 0.78125, + "learning_rate": 8.293870967741935e-05, + "loss": 0.1605, + "step": 11079 + }, + { + "epoch": 0.17728, + "grad_norm": 1.1484375, + "learning_rate": 8.293709677419355e-05, + "loss": 0.1561, + "step": 11080 + }, + { + "epoch": 0.177296, + "grad_norm": 0.7734375, + "learning_rate": 8.293548387096774e-05, + "loss": 0.1597, + "step": 11081 + }, + { + "epoch": 0.177312, + "grad_norm": 0.59765625, + "learning_rate": 8.293387096774194e-05, + "loss": 0.145, + "step": 11082 + }, + { + "epoch": 0.177328, + "grad_norm": 1.0390625, + "learning_rate": 8.293225806451614e-05, + "loss": 0.1834, + "step": 11083 + }, + { + "epoch": 0.177344, + "grad_norm": 0.66015625, + "learning_rate": 8.293064516129032e-05, + "loss": 0.1528, + "step": 11084 + }, + { + "epoch": 0.17736, + "grad_norm": 0.6015625, + "learning_rate": 8.292903225806452e-05, + "loss": 0.1665, + "step": 11085 + }, + { + "epoch": 0.177376, + "grad_norm": 0.52734375, + "learning_rate": 8.292741935483871e-05, + "loss": 0.142, + "step": 11086 + }, + { + "epoch": 0.177392, + "grad_norm": 0.640625, + "learning_rate": 8.292580645161291e-05, + "loss": 0.1822, + "step": 11087 + }, + { + "epoch": 0.177408, + "grad_norm": 0.7421875, + "learning_rate": 8.29241935483871e-05, + "loss": 0.1831, + "step": 11088 + }, + { + "epoch": 0.177424, + "grad_norm": 1.0703125, + "learning_rate": 8.29225806451613e-05, + "loss": 0.151, + "step": 11089 + }, + { + "epoch": 0.17744, + "grad_norm": 0.73828125, + "learning_rate": 8.292096774193548e-05, + "loss": 0.1432, + "step": 11090 + }, + { + "epoch": 0.177456, + "grad_norm": 0.56640625, + "learning_rate": 8.291935483870968e-05, + "loss": 0.1889, + "step": 11091 + }, + { + "epoch": 0.177472, + "grad_norm": 1.1328125, + "learning_rate": 8.291774193548387e-05, + "loss": 0.2, + "step": 11092 + }, + { + "epoch": 0.177488, + "grad_norm": 0.6875, + "learning_rate": 8.291612903225807e-05, + "loss": 0.1861, + "step": 11093 + }, + { + "epoch": 0.177504, + "grad_norm": 0.75, + "learning_rate": 8.291451612903227e-05, + "loss": 0.1639, + "step": 11094 + }, + { + "epoch": 0.17752, + "grad_norm": 1.3828125, + "learning_rate": 8.291290322580647e-05, + "loss": 0.1965, + "step": 11095 + }, + { + "epoch": 0.177536, + "grad_norm": 0.59765625, + "learning_rate": 8.291129032258065e-05, + "loss": 0.1205, + "step": 11096 + }, + { + "epoch": 0.177552, + "grad_norm": 0.83984375, + "learning_rate": 8.290967741935485e-05, + "loss": 0.212, + "step": 11097 + }, + { + "epoch": 0.177568, + "grad_norm": 0.83203125, + "learning_rate": 8.290806451612904e-05, + "loss": 0.1621, + "step": 11098 + }, + { + "epoch": 0.177584, + "grad_norm": 0.87890625, + "learning_rate": 8.290645161290322e-05, + "loss": 0.1886, + "step": 11099 + }, + { + "epoch": 0.1776, + "grad_norm": 0.90234375, + "learning_rate": 8.290483870967742e-05, + "loss": 0.1706, + "step": 11100 + }, + { + "epoch": 0.177616, + "grad_norm": 0.7265625, + "learning_rate": 8.290322580645161e-05, + "loss": 0.1471, + "step": 11101 + }, + { + "epoch": 0.177632, + "grad_norm": 1.1328125, + "learning_rate": 8.290161290322581e-05, + "loss": 0.1656, + "step": 11102 + }, + { + "epoch": 0.177648, + "grad_norm": 1.3359375, + "learning_rate": 8.29e-05, + "loss": 0.1816, + "step": 11103 + }, + { + "epoch": 0.177664, + "grad_norm": 1.2734375, + "learning_rate": 8.28983870967742e-05, + "loss": 0.156, + "step": 11104 + }, + { + "epoch": 0.17768, + "grad_norm": 1.0, + "learning_rate": 8.289677419354838e-05, + "loss": 0.1777, + "step": 11105 + }, + { + "epoch": 0.177696, + "grad_norm": 0.96484375, + "learning_rate": 8.289516129032258e-05, + "loss": 0.1211, + "step": 11106 + }, + { + "epoch": 0.177712, + "grad_norm": 0.79296875, + "learning_rate": 8.289354838709678e-05, + "loss": 0.1726, + "step": 11107 + }, + { + "epoch": 0.177728, + "grad_norm": 0.74609375, + "learning_rate": 8.289193548387098e-05, + "loss": 0.1731, + "step": 11108 + }, + { + "epoch": 0.177744, + "grad_norm": 0.8203125, + "learning_rate": 8.289032258064517e-05, + "loss": 0.1931, + "step": 11109 + }, + { + "epoch": 0.17776, + "grad_norm": 0.8046875, + "learning_rate": 8.288870967741937e-05, + "loss": 0.1832, + "step": 11110 + }, + { + "epoch": 0.177776, + "grad_norm": 1.40625, + "learning_rate": 8.288709677419355e-05, + "loss": 0.1527, + "step": 11111 + }, + { + "epoch": 0.177792, + "grad_norm": 0.84765625, + "learning_rate": 8.288548387096775e-05, + "loss": 0.1763, + "step": 11112 + }, + { + "epoch": 0.177808, + "grad_norm": 1.0859375, + "learning_rate": 8.288387096774194e-05, + "loss": 0.1448, + "step": 11113 + }, + { + "epoch": 0.177824, + "grad_norm": 0.9765625, + "learning_rate": 8.288225806451614e-05, + "loss": 0.1513, + "step": 11114 + }, + { + "epoch": 0.17784, + "grad_norm": 1.0078125, + "learning_rate": 8.288064516129032e-05, + "loss": 0.2284, + "step": 11115 + }, + { + "epoch": 0.177856, + "grad_norm": 0.765625, + "learning_rate": 8.287903225806451e-05, + "loss": 0.2118, + "step": 11116 + }, + { + "epoch": 0.177872, + "grad_norm": 0.73046875, + "learning_rate": 8.287741935483871e-05, + "loss": 0.1555, + "step": 11117 + }, + { + "epoch": 0.177888, + "grad_norm": 0.87109375, + "learning_rate": 8.287580645161291e-05, + "loss": 0.1921, + "step": 11118 + }, + { + "epoch": 0.177904, + "grad_norm": 0.57421875, + "learning_rate": 8.287419354838711e-05, + "loss": 0.1525, + "step": 11119 + }, + { + "epoch": 0.17792, + "grad_norm": 0.5078125, + "learning_rate": 8.28725806451613e-05, + "loss": 0.1359, + "step": 11120 + }, + { + "epoch": 0.177936, + "grad_norm": 0.90234375, + "learning_rate": 8.28709677419355e-05, + "loss": 0.1603, + "step": 11121 + }, + { + "epoch": 0.177952, + "grad_norm": 0.75, + "learning_rate": 8.286935483870968e-05, + "loss": 0.1807, + "step": 11122 + }, + { + "epoch": 0.177968, + "grad_norm": 0.79296875, + "learning_rate": 8.286774193548388e-05, + "loss": 0.2083, + "step": 11123 + }, + { + "epoch": 0.177984, + "grad_norm": 0.69921875, + "learning_rate": 8.286612903225807e-05, + "loss": 0.18, + "step": 11124 + }, + { + "epoch": 0.178, + "grad_norm": 0.546875, + "learning_rate": 8.286451612903227e-05, + "loss": 0.152, + "step": 11125 + }, + { + "epoch": 0.178016, + "grad_norm": 0.5, + "learning_rate": 8.286290322580645e-05, + "loss": 0.1605, + "step": 11126 + }, + { + "epoch": 0.178032, + "grad_norm": 0.859375, + "learning_rate": 8.286129032258065e-05, + "loss": 0.1917, + "step": 11127 + }, + { + "epoch": 0.178048, + "grad_norm": 0.6953125, + "learning_rate": 8.285967741935484e-05, + "loss": 0.1792, + "step": 11128 + }, + { + "epoch": 0.178064, + "grad_norm": 0.66796875, + "learning_rate": 8.285806451612904e-05, + "loss": 0.1671, + "step": 11129 + }, + { + "epoch": 0.17808, + "grad_norm": 1.140625, + "learning_rate": 8.285645161290322e-05, + "loss": 0.1663, + "step": 11130 + }, + { + "epoch": 0.178096, + "grad_norm": 0.671875, + "learning_rate": 8.285483870967742e-05, + "loss": 0.1436, + "step": 11131 + }, + { + "epoch": 0.178112, + "grad_norm": 0.99609375, + "learning_rate": 8.285322580645162e-05, + "loss": 0.1621, + "step": 11132 + }, + { + "epoch": 0.178128, + "grad_norm": 0.9453125, + "learning_rate": 8.285161290322581e-05, + "loss": 0.2174, + "step": 11133 + }, + { + "epoch": 0.178144, + "grad_norm": 0.578125, + "learning_rate": 8.285000000000001e-05, + "loss": 0.167, + "step": 11134 + }, + { + "epoch": 0.17816, + "grad_norm": 0.84375, + "learning_rate": 8.28483870967742e-05, + "loss": 0.1472, + "step": 11135 + }, + { + "epoch": 0.178176, + "grad_norm": 0.69921875, + "learning_rate": 8.28467741935484e-05, + "loss": 0.168, + "step": 11136 + }, + { + "epoch": 0.178192, + "grad_norm": 0.8125, + "learning_rate": 8.284516129032258e-05, + "loss": 0.161, + "step": 11137 + }, + { + "epoch": 0.178208, + "grad_norm": 1.09375, + "learning_rate": 8.284354838709678e-05, + "loss": 0.1761, + "step": 11138 + }, + { + "epoch": 0.178224, + "grad_norm": 0.73828125, + "learning_rate": 8.284193548387097e-05, + "loss": 0.1798, + "step": 11139 + }, + { + "epoch": 0.17824, + "grad_norm": 0.828125, + "learning_rate": 8.284032258064517e-05, + "loss": 0.135, + "step": 11140 + }, + { + "epoch": 0.178256, + "grad_norm": 0.890625, + "learning_rate": 8.283870967741935e-05, + "loss": 0.1522, + "step": 11141 + }, + { + "epoch": 0.178272, + "grad_norm": 0.87109375, + "learning_rate": 8.283709677419355e-05, + "loss": 0.2054, + "step": 11142 + }, + { + "epoch": 0.178288, + "grad_norm": 0.7421875, + "learning_rate": 8.283548387096775e-05, + "loss": 0.1662, + "step": 11143 + }, + { + "epoch": 0.178304, + "grad_norm": 0.546875, + "learning_rate": 8.283387096774195e-05, + "loss": 0.1324, + "step": 11144 + }, + { + "epoch": 0.17832, + "grad_norm": 0.890625, + "learning_rate": 8.283225806451614e-05, + "loss": 0.1682, + "step": 11145 + }, + { + "epoch": 0.178336, + "grad_norm": 0.69921875, + "learning_rate": 8.283064516129032e-05, + "loss": 0.159, + "step": 11146 + }, + { + "epoch": 0.178352, + "grad_norm": 0.9375, + "learning_rate": 8.282903225806452e-05, + "loss": 0.176, + "step": 11147 + }, + { + "epoch": 0.178368, + "grad_norm": 0.5703125, + "learning_rate": 8.282741935483871e-05, + "loss": 0.1637, + "step": 11148 + }, + { + "epoch": 0.178384, + "grad_norm": 0.91015625, + "learning_rate": 8.282580645161291e-05, + "loss": 0.1504, + "step": 11149 + }, + { + "epoch": 0.1784, + "grad_norm": 0.7109375, + "learning_rate": 8.28241935483871e-05, + "loss": 0.158, + "step": 11150 + }, + { + "epoch": 0.178416, + "grad_norm": 0.76953125, + "learning_rate": 8.28225806451613e-05, + "loss": 0.1671, + "step": 11151 + }, + { + "epoch": 0.178432, + "grad_norm": 1.078125, + "learning_rate": 8.282096774193548e-05, + "loss": 0.183, + "step": 11152 + }, + { + "epoch": 0.178448, + "grad_norm": 1.0390625, + "learning_rate": 8.281935483870968e-05, + "loss": 0.179, + "step": 11153 + }, + { + "epoch": 0.178464, + "grad_norm": 1.0078125, + "learning_rate": 8.281774193548388e-05, + "loss": 0.1928, + "step": 11154 + }, + { + "epoch": 0.17848, + "grad_norm": 0.51953125, + "learning_rate": 8.281612903225808e-05, + "loss": 0.1329, + "step": 11155 + }, + { + "epoch": 0.178496, + "grad_norm": 0.90234375, + "learning_rate": 8.281451612903226e-05, + "loss": 0.1837, + "step": 11156 + }, + { + "epoch": 0.178512, + "grad_norm": 1.1953125, + "learning_rate": 8.281290322580646e-05, + "loss": 0.1887, + "step": 11157 + }, + { + "epoch": 0.178528, + "grad_norm": 1.2890625, + "learning_rate": 8.281129032258065e-05, + "loss": 0.1677, + "step": 11158 + }, + { + "epoch": 0.178544, + "grad_norm": 0.828125, + "learning_rate": 8.280967741935485e-05, + "loss": 0.1741, + "step": 11159 + }, + { + "epoch": 0.17856, + "grad_norm": 1.1640625, + "learning_rate": 8.280806451612904e-05, + "loss": 0.1551, + "step": 11160 + }, + { + "epoch": 0.178576, + "grad_norm": 1.140625, + "learning_rate": 8.280645161290324e-05, + "loss": 0.125, + "step": 11161 + }, + { + "epoch": 0.178592, + "grad_norm": 0.7734375, + "learning_rate": 8.280483870967742e-05, + "loss": 0.172, + "step": 11162 + }, + { + "epoch": 0.178608, + "grad_norm": 0.7421875, + "learning_rate": 8.280322580645161e-05, + "loss": 0.1735, + "step": 11163 + }, + { + "epoch": 0.178624, + "grad_norm": 0.78515625, + "learning_rate": 8.280161290322581e-05, + "loss": 0.1525, + "step": 11164 + }, + { + "epoch": 0.17864, + "grad_norm": 0.703125, + "learning_rate": 8.28e-05, + "loss": 0.1634, + "step": 11165 + }, + { + "epoch": 0.178656, + "grad_norm": 0.5859375, + "learning_rate": 8.279838709677419e-05, + "loss": 0.1623, + "step": 11166 + }, + { + "epoch": 0.178672, + "grad_norm": 0.82421875, + "learning_rate": 8.279677419354839e-05, + "loss": 0.1322, + "step": 11167 + }, + { + "epoch": 0.178688, + "grad_norm": 0.98046875, + "learning_rate": 8.279516129032259e-05, + "loss": 0.1401, + "step": 11168 + }, + { + "epoch": 0.178704, + "grad_norm": 0.640625, + "learning_rate": 8.279354838709678e-05, + "loss": 0.1778, + "step": 11169 + }, + { + "epoch": 0.17872, + "grad_norm": 0.81640625, + "learning_rate": 8.279193548387098e-05, + "loss": 0.1695, + "step": 11170 + }, + { + "epoch": 0.178736, + "grad_norm": 1.09375, + "learning_rate": 8.279032258064516e-05, + "loss": 0.1829, + "step": 11171 + }, + { + "epoch": 0.178752, + "grad_norm": 1.1640625, + "learning_rate": 8.278870967741936e-05, + "loss": 0.1862, + "step": 11172 + }, + { + "epoch": 0.178768, + "grad_norm": 0.81640625, + "learning_rate": 8.278709677419355e-05, + "loss": 0.1677, + "step": 11173 + }, + { + "epoch": 0.178784, + "grad_norm": 1.0234375, + "learning_rate": 8.278548387096775e-05, + "loss": 0.1581, + "step": 11174 + }, + { + "epoch": 0.1788, + "grad_norm": 1.1015625, + "learning_rate": 8.278387096774194e-05, + "loss": 0.1949, + "step": 11175 + }, + { + "epoch": 0.178816, + "grad_norm": 1.4140625, + "learning_rate": 8.278225806451614e-05, + "loss": 0.1846, + "step": 11176 + }, + { + "epoch": 0.178832, + "grad_norm": 0.6796875, + "learning_rate": 8.278064516129032e-05, + "loss": 0.1605, + "step": 11177 + }, + { + "epoch": 0.178848, + "grad_norm": 0.56640625, + "learning_rate": 8.277903225806452e-05, + "loss": 0.1279, + "step": 11178 + }, + { + "epoch": 0.178864, + "grad_norm": 1.40625, + "learning_rate": 8.277741935483872e-05, + "loss": 0.1997, + "step": 11179 + }, + { + "epoch": 0.17888, + "grad_norm": 0.86328125, + "learning_rate": 8.27758064516129e-05, + "loss": 0.1714, + "step": 11180 + }, + { + "epoch": 0.178896, + "grad_norm": 1.9609375, + "learning_rate": 8.27741935483871e-05, + "loss": 0.2142, + "step": 11181 + }, + { + "epoch": 0.178912, + "grad_norm": 0.9765625, + "learning_rate": 8.277258064516129e-05, + "loss": 0.18, + "step": 11182 + }, + { + "epoch": 0.178928, + "grad_norm": 0.7578125, + "learning_rate": 8.277096774193549e-05, + "loss": 0.2231, + "step": 11183 + }, + { + "epoch": 0.178944, + "grad_norm": 1.1328125, + "learning_rate": 8.276935483870968e-05, + "loss": 0.2158, + "step": 11184 + }, + { + "epoch": 0.17896, + "grad_norm": 0.80859375, + "learning_rate": 8.276774193548388e-05, + "loss": 0.1715, + "step": 11185 + }, + { + "epoch": 0.178976, + "grad_norm": 0.5625, + "learning_rate": 8.276612903225806e-05, + "loss": 0.1511, + "step": 11186 + }, + { + "epoch": 0.178992, + "grad_norm": 0.90625, + "learning_rate": 8.276451612903226e-05, + "loss": 0.1637, + "step": 11187 + }, + { + "epoch": 0.179008, + "grad_norm": 0.62109375, + "learning_rate": 8.276290322580645e-05, + "loss": 0.1556, + "step": 11188 + }, + { + "epoch": 0.179024, + "grad_norm": 0.65625, + "learning_rate": 8.276129032258065e-05, + "loss": 0.1506, + "step": 11189 + }, + { + "epoch": 0.17904, + "grad_norm": 0.7578125, + "learning_rate": 8.275967741935485e-05, + "loss": 0.1475, + "step": 11190 + }, + { + "epoch": 0.179056, + "grad_norm": 0.87109375, + "learning_rate": 8.275806451612903e-05, + "loss": 0.1949, + "step": 11191 + }, + { + "epoch": 0.179072, + "grad_norm": 0.81640625, + "learning_rate": 8.275645161290323e-05, + "loss": 0.1528, + "step": 11192 + }, + { + "epoch": 0.179088, + "grad_norm": 0.9609375, + "learning_rate": 8.275483870967742e-05, + "loss": 0.1713, + "step": 11193 + }, + { + "epoch": 0.179104, + "grad_norm": 0.94921875, + "learning_rate": 8.275322580645162e-05, + "loss": 0.1399, + "step": 11194 + }, + { + "epoch": 0.17912, + "grad_norm": 0.8125, + "learning_rate": 8.27516129032258e-05, + "loss": 0.1482, + "step": 11195 + }, + { + "epoch": 0.179136, + "grad_norm": 0.74609375, + "learning_rate": 8.275e-05, + "loss": 0.1749, + "step": 11196 + }, + { + "epoch": 0.179152, + "grad_norm": 1.125, + "learning_rate": 8.274838709677419e-05, + "loss": 0.1626, + "step": 11197 + }, + { + "epoch": 0.179168, + "grad_norm": 0.83984375, + "learning_rate": 8.274677419354839e-05, + "loss": 0.1741, + "step": 11198 + }, + { + "epoch": 0.179184, + "grad_norm": 1.7578125, + "learning_rate": 8.274516129032258e-05, + "loss": 0.1709, + "step": 11199 + }, + { + "epoch": 0.1792, + "grad_norm": 0.53125, + "learning_rate": 8.274354838709678e-05, + "loss": 0.1531, + "step": 11200 + }, + { + "epoch": 0.179216, + "grad_norm": 0.5703125, + "learning_rate": 8.274193548387096e-05, + "loss": 0.1503, + "step": 11201 + }, + { + "epoch": 0.179232, + "grad_norm": 0.64453125, + "learning_rate": 8.274032258064516e-05, + "loss": 0.1765, + "step": 11202 + }, + { + "epoch": 0.179248, + "grad_norm": 0.76953125, + "learning_rate": 8.273870967741936e-05, + "loss": 0.1385, + "step": 11203 + }, + { + "epoch": 0.179264, + "grad_norm": 1.0703125, + "learning_rate": 8.273709677419356e-05, + "loss": 0.1593, + "step": 11204 + }, + { + "epoch": 0.17928, + "grad_norm": 0.73046875, + "learning_rate": 8.273548387096775e-05, + "loss": 0.1813, + "step": 11205 + }, + { + "epoch": 0.179296, + "grad_norm": 0.85546875, + "learning_rate": 8.273387096774195e-05, + "loss": 0.2135, + "step": 11206 + }, + { + "epoch": 0.179312, + "grad_norm": 1.1640625, + "learning_rate": 8.273225806451613e-05, + "loss": 0.1842, + "step": 11207 + }, + { + "epoch": 0.179328, + "grad_norm": 0.6484375, + "learning_rate": 8.273064516129032e-05, + "loss": 0.1357, + "step": 11208 + }, + { + "epoch": 0.179344, + "grad_norm": 0.56640625, + "learning_rate": 8.272903225806452e-05, + "loss": 0.1457, + "step": 11209 + }, + { + "epoch": 0.17936, + "grad_norm": 0.58203125, + "learning_rate": 8.27274193548387e-05, + "loss": 0.172, + "step": 11210 + }, + { + "epoch": 0.179376, + "grad_norm": 0.58203125, + "learning_rate": 8.27258064516129e-05, + "loss": 0.1401, + "step": 11211 + }, + { + "epoch": 0.179392, + "grad_norm": 0.82421875, + "learning_rate": 8.272419354838709e-05, + "loss": 0.1809, + "step": 11212 + }, + { + "epoch": 0.179408, + "grad_norm": 0.75390625, + "learning_rate": 8.272258064516129e-05, + "loss": 0.1736, + "step": 11213 + }, + { + "epoch": 0.179424, + "grad_norm": 1.1953125, + "learning_rate": 8.272096774193549e-05, + "loss": 0.1919, + "step": 11214 + }, + { + "epoch": 0.17944, + "grad_norm": 0.67578125, + "learning_rate": 8.271935483870969e-05, + "loss": 0.1768, + "step": 11215 + }, + { + "epoch": 0.179456, + "grad_norm": 0.96484375, + "learning_rate": 8.271774193548388e-05, + "loss": 0.2014, + "step": 11216 + }, + { + "epoch": 0.179472, + "grad_norm": 0.5859375, + "learning_rate": 8.271612903225808e-05, + "loss": 0.1585, + "step": 11217 + }, + { + "epoch": 0.179488, + "grad_norm": 0.734375, + "learning_rate": 8.271451612903226e-05, + "loss": 0.1675, + "step": 11218 + }, + { + "epoch": 0.179504, + "grad_norm": 0.87109375, + "learning_rate": 8.271290322580646e-05, + "loss": 0.1853, + "step": 11219 + }, + { + "epoch": 0.17952, + "grad_norm": 0.70703125, + "learning_rate": 8.271129032258065e-05, + "loss": 0.1846, + "step": 11220 + }, + { + "epoch": 0.179536, + "grad_norm": 0.82421875, + "learning_rate": 8.270967741935485e-05, + "loss": 0.1726, + "step": 11221 + }, + { + "epoch": 0.179552, + "grad_norm": 0.62109375, + "learning_rate": 8.270806451612903e-05, + "loss": 0.192, + "step": 11222 + }, + { + "epoch": 0.179568, + "grad_norm": 0.85546875, + "learning_rate": 8.270645161290323e-05, + "loss": 0.1721, + "step": 11223 + }, + { + "epoch": 0.179584, + "grad_norm": 0.83203125, + "learning_rate": 8.270483870967742e-05, + "loss": 0.1692, + "step": 11224 + }, + { + "epoch": 0.1796, + "grad_norm": 0.8984375, + "learning_rate": 8.27032258064516e-05, + "loss": 0.1524, + "step": 11225 + }, + { + "epoch": 0.179616, + "grad_norm": 0.52734375, + "learning_rate": 8.27016129032258e-05, + "loss": 0.1716, + "step": 11226 + }, + { + "epoch": 0.179632, + "grad_norm": 0.94140625, + "learning_rate": 8.27e-05, + "loss": 0.1486, + "step": 11227 + }, + { + "epoch": 0.179648, + "grad_norm": 0.58984375, + "learning_rate": 8.26983870967742e-05, + "loss": 0.1816, + "step": 11228 + }, + { + "epoch": 0.179664, + "grad_norm": 0.90625, + "learning_rate": 8.269677419354839e-05, + "loss": 0.1824, + "step": 11229 + }, + { + "epoch": 0.17968, + "grad_norm": 0.99609375, + "learning_rate": 8.269516129032259e-05, + "loss": 0.2031, + "step": 11230 + }, + { + "epoch": 0.179696, + "grad_norm": 1.8125, + "learning_rate": 8.269354838709678e-05, + "loss": 0.1566, + "step": 11231 + }, + { + "epoch": 0.179712, + "grad_norm": 0.65234375, + "learning_rate": 8.269193548387098e-05, + "loss": 0.1797, + "step": 11232 + }, + { + "epoch": 0.179728, + "grad_norm": 1.125, + "learning_rate": 8.269032258064516e-05, + "loss": 0.1737, + "step": 11233 + }, + { + "epoch": 0.179744, + "grad_norm": 0.78125, + "learning_rate": 8.268870967741936e-05, + "loss": 0.1695, + "step": 11234 + }, + { + "epoch": 0.17976, + "grad_norm": 0.7421875, + "learning_rate": 8.268709677419355e-05, + "loss": 0.1755, + "step": 11235 + }, + { + "epoch": 0.179776, + "grad_norm": 0.72265625, + "learning_rate": 8.268548387096775e-05, + "loss": 0.1534, + "step": 11236 + }, + { + "epoch": 0.179792, + "grad_norm": 0.5234375, + "learning_rate": 8.268387096774193e-05, + "loss": 0.1491, + "step": 11237 + }, + { + "epoch": 0.179808, + "grad_norm": 0.83984375, + "learning_rate": 8.268225806451613e-05, + "loss": 0.1764, + "step": 11238 + }, + { + "epoch": 0.179824, + "grad_norm": 0.7265625, + "learning_rate": 8.268064516129033e-05, + "loss": 0.17, + "step": 11239 + }, + { + "epoch": 0.17984, + "grad_norm": 0.71484375, + "learning_rate": 8.267903225806452e-05, + "loss": 0.2086, + "step": 11240 + }, + { + "epoch": 0.179856, + "grad_norm": 1.015625, + "learning_rate": 8.267741935483872e-05, + "loss": 0.1751, + "step": 11241 + }, + { + "epoch": 0.179872, + "grad_norm": 0.72265625, + "learning_rate": 8.26758064516129e-05, + "loss": 0.1844, + "step": 11242 + }, + { + "epoch": 0.179888, + "grad_norm": 1.1015625, + "learning_rate": 8.26741935483871e-05, + "loss": 0.1966, + "step": 11243 + }, + { + "epoch": 0.179904, + "grad_norm": 0.90625, + "learning_rate": 8.267258064516129e-05, + "loss": 0.2022, + "step": 11244 + }, + { + "epoch": 0.17992, + "grad_norm": 0.71875, + "learning_rate": 8.267096774193549e-05, + "loss": 0.2037, + "step": 11245 + }, + { + "epoch": 0.179936, + "grad_norm": 0.83984375, + "learning_rate": 8.266935483870968e-05, + "loss": 0.1745, + "step": 11246 + }, + { + "epoch": 0.179952, + "grad_norm": 0.8671875, + "learning_rate": 8.266774193548388e-05, + "loss": 0.1558, + "step": 11247 + }, + { + "epoch": 0.179968, + "grad_norm": 0.72265625, + "learning_rate": 8.266612903225806e-05, + "loss": 0.186, + "step": 11248 + }, + { + "epoch": 0.179984, + "grad_norm": 0.921875, + "learning_rate": 8.266451612903226e-05, + "loss": 0.2319, + "step": 11249 + }, + { + "epoch": 0.18, + "grad_norm": 1.3671875, + "learning_rate": 8.266290322580646e-05, + "loss": 0.1753, + "step": 11250 + }, + { + "epoch": 0.180016, + "grad_norm": 0.9609375, + "learning_rate": 8.266129032258066e-05, + "loss": 0.246, + "step": 11251 + }, + { + "epoch": 0.180032, + "grad_norm": 0.7578125, + "learning_rate": 8.265967741935485e-05, + "loss": 0.1833, + "step": 11252 + }, + { + "epoch": 0.180048, + "grad_norm": 0.87109375, + "learning_rate": 8.265806451612905e-05, + "loss": 0.2222, + "step": 11253 + }, + { + "epoch": 0.180064, + "grad_norm": 0.70703125, + "learning_rate": 8.265645161290323e-05, + "loss": 0.1822, + "step": 11254 + }, + { + "epoch": 0.18008, + "grad_norm": 0.7734375, + "learning_rate": 8.265483870967742e-05, + "loss": 0.1517, + "step": 11255 + }, + { + "epoch": 0.180096, + "grad_norm": 1.0078125, + "learning_rate": 8.265322580645162e-05, + "loss": 0.1539, + "step": 11256 + }, + { + "epoch": 0.180112, + "grad_norm": 1.4609375, + "learning_rate": 8.26516129032258e-05, + "loss": 0.1883, + "step": 11257 + }, + { + "epoch": 0.180128, + "grad_norm": 0.63671875, + "learning_rate": 8.265e-05, + "loss": 0.1573, + "step": 11258 + }, + { + "epoch": 0.180144, + "grad_norm": 0.984375, + "learning_rate": 8.264838709677419e-05, + "loss": 0.1335, + "step": 11259 + }, + { + "epoch": 0.18016, + "grad_norm": 0.91796875, + "learning_rate": 8.264677419354839e-05, + "loss": 0.1822, + "step": 11260 + }, + { + "epoch": 0.180176, + "grad_norm": 0.8359375, + "learning_rate": 8.264516129032257e-05, + "loss": 0.1995, + "step": 11261 + }, + { + "epoch": 0.180192, + "grad_norm": 0.81640625, + "learning_rate": 8.264354838709677e-05, + "loss": 0.1483, + "step": 11262 + }, + { + "epoch": 0.180208, + "grad_norm": 0.91015625, + "learning_rate": 8.264193548387097e-05, + "loss": 0.1974, + "step": 11263 + }, + { + "epoch": 0.180224, + "grad_norm": 0.68359375, + "learning_rate": 8.264032258064517e-05, + "loss": 0.1474, + "step": 11264 + }, + { + "epoch": 0.18024, + "grad_norm": 0.8203125, + "learning_rate": 8.263870967741936e-05, + "loss": 0.1887, + "step": 11265 + }, + { + "epoch": 0.180256, + "grad_norm": 0.82421875, + "learning_rate": 8.263709677419356e-05, + "loss": 0.1644, + "step": 11266 + }, + { + "epoch": 0.180272, + "grad_norm": 1.265625, + "learning_rate": 8.263548387096775e-05, + "loss": 0.1589, + "step": 11267 + }, + { + "epoch": 0.180288, + "grad_norm": 0.91796875, + "learning_rate": 8.263387096774195e-05, + "loss": 0.1827, + "step": 11268 + }, + { + "epoch": 0.180304, + "grad_norm": 1.3984375, + "learning_rate": 8.263225806451613e-05, + "loss": 0.2368, + "step": 11269 + }, + { + "epoch": 0.18032, + "grad_norm": 0.98828125, + "learning_rate": 8.263064516129033e-05, + "loss": 0.1665, + "step": 11270 + }, + { + "epoch": 0.180336, + "grad_norm": 0.7421875, + "learning_rate": 8.262903225806452e-05, + "loss": 0.1581, + "step": 11271 + }, + { + "epoch": 0.180352, + "grad_norm": 1.0234375, + "learning_rate": 8.26274193548387e-05, + "loss": 0.1985, + "step": 11272 + }, + { + "epoch": 0.180368, + "grad_norm": 0.67578125, + "learning_rate": 8.26258064516129e-05, + "loss": 0.1557, + "step": 11273 + }, + { + "epoch": 0.180384, + "grad_norm": 0.8671875, + "learning_rate": 8.26241935483871e-05, + "loss": 0.1656, + "step": 11274 + }, + { + "epoch": 0.1804, + "grad_norm": 0.73046875, + "learning_rate": 8.26225806451613e-05, + "loss": 0.1624, + "step": 11275 + }, + { + "epoch": 0.180416, + "grad_norm": 0.8203125, + "learning_rate": 8.262096774193549e-05, + "loss": 0.1692, + "step": 11276 + }, + { + "epoch": 0.180432, + "grad_norm": 1.3671875, + "learning_rate": 8.261935483870969e-05, + "loss": 0.1547, + "step": 11277 + }, + { + "epoch": 0.180448, + "grad_norm": 0.94140625, + "learning_rate": 8.261774193548387e-05, + "loss": 0.139, + "step": 11278 + }, + { + "epoch": 0.180464, + "grad_norm": 1.0625, + "learning_rate": 8.261612903225807e-05, + "loss": 0.1645, + "step": 11279 + }, + { + "epoch": 0.18048, + "grad_norm": 0.73046875, + "learning_rate": 8.261451612903226e-05, + "loss": 0.1484, + "step": 11280 + }, + { + "epoch": 0.180496, + "grad_norm": 1.1171875, + "learning_rate": 8.261290322580646e-05, + "loss": 0.1516, + "step": 11281 + }, + { + "epoch": 0.180512, + "grad_norm": 0.61328125, + "learning_rate": 8.261129032258065e-05, + "loss": 0.1326, + "step": 11282 + }, + { + "epoch": 0.180528, + "grad_norm": 0.7109375, + "learning_rate": 8.260967741935484e-05, + "loss": 0.1924, + "step": 11283 + }, + { + "epoch": 0.180544, + "grad_norm": 0.78515625, + "learning_rate": 8.260806451612903e-05, + "loss": 0.1628, + "step": 11284 + }, + { + "epoch": 0.18056, + "grad_norm": 0.625, + "learning_rate": 8.260645161290323e-05, + "loss": 0.1508, + "step": 11285 + }, + { + "epoch": 0.180576, + "grad_norm": 0.60546875, + "learning_rate": 8.260483870967743e-05, + "loss": 0.153, + "step": 11286 + }, + { + "epoch": 0.180592, + "grad_norm": 1.3515625, + "learning_rate": 8.260322580645162e-05, + "loss": 0.2099, + "step": 11287 + }, + { + "epoch": 0.180608, + "grad_norm": 1.078125, + "learning_rate": 8.260161290322582e-05, + "loss": 0.1288, + "step": 11288 + }, + { + "epoch": 0.180624, + "grad_norm": 1.078125, + "learning_rate": 8.26e-05, + "loss": 0.1583, + "step": 11289 + }, + { + "epoch": 0.18064, + "grad_norm": 0.72265625, + "learning_rate": 8.25983870967742e-05, + "loss": 0.1501, + "step": 11290 + }, + { + "epoch": 0.180656, + "grad_norm": 0.6015625, + "learning_rate": 8.259677419354839e-05, + "loss": 0.1543, + "step": 11291 + }, + { + "epoch": 0.180672, + "grad_norm": 0.7578125, + "learning_rate": 8.259516129032259e-05, + "loss": 0.1502, + "step": 11292 + }, + { + "epoch": 0.180688, + "grad_norm": 0.80078125, + "learning_rate": 8.259354838709677e-05, + "loss": 0.1377, + "step": 11293 + }, + { + "epoch": 0.180704, + "grad_norm": 1.0, + "learning_rate": 8.259193548387097e-05, + "loss": 0.1631, + "step": 11294 + }, + { + "epoch": 0.18072, + "grad_norm": 0.83203125, + "learning_rate": 8.259032258064516e-05, + "loss": 0.1536, + "step": 11295 + }, + { + "epoch": 0.180736, + "grad_norm": 0.78515625, + "learning_rate": 8.258870967741936e-05, + "loss": 0.1325, + "step": 11296 + }, + { + "epoch": 0.180752, + "grad_norm": 2.3125, + "learning_rate": 8.258709677419354e-05, + "loss": 0.2071, + "step": 11297 + }, + { + "epoch": 0.180768, + "grad_norm": 0.8046875, + "learning_rate": 8.258548387096774e-05, + "loss": 0.1728, + "step": 11298 + }, + { + "epoch": 0.180784, + "grad_norm": 0.7734375, + "learning_rate": 8.258387096774194e-05, + "loss": 0.1498, + "step": 11299 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5234375, + "learning_rate": 8.258225806451614e-05, + "loss": 0.1481, + "step": 11300 + }, + { + "epoch": 0.180816, + "grad_norm": 0.6328125, + "learning_rate": 8.258064516129033e-05, + "loss": 0.1495, + "step": 11301 + }, + { + "epoch": 0.180832, + "grad_norm": 1.140625, + "learning_rate": 8.257903225806452e-05, + "loss": 0.1466, + "step": 11302 + }, + { + "epoch": 0.180848, + "grad_norm": 0.83203125, + "learning_rate": 8.257741935483872e-05, + "loss": 0.1617, + "step": 11303 + }, + { + "epoch": 0.180864, + "grad_norm": 0.546875, + "learning_rate": 8.25758064516129e-05, + "loss": 0.15, + "step": 11304 + }, + { + "epoch": 0.18088, + "grad_norm": 0.6875, + "learning_rate": 8.25741935483871e-05, + "loss": 0.1733, + "step": 11305 + }, + { + "epoch": 0.180896, + "grad_norm": 0.8359375, + "learning_rate": 8.257258064516129e-05, + "loss": 0.2078, + "step": 11306 + }, + { + "epoch": 0.180912, + "grad_norm": 0.59765625, + "learning_rate": 8.257096774193549e-05, + "loss": 0.1691, + "step": 11307 + }, + { + "epoch": 0.180928, + "grad_norm": 0.9921875, + "learning_rate": 8.256935483870967e-05, + "loss": 0.2108, + "step": 11308 + }, + { + "epoch": 0.180944, + "grad_norm": 0.78515625, + "learning_rate": 8.256774193548387e-05, + "loss": 0.1573, + "step": 11309 + }, + { + "epoch": 0.18096, + "grad_norm": 0.61328125, + "learning_rate": 8.256612903225807e-05, + "loss": 0.1921, + "step": 11310 + }, + { + "epoch": 0.180976, + "grad_norm": 1.1328125, + "learning_rate": 8.256451612903227e-05, + "loss": 0.2095, + "step": 11311 + }, + { + "epoch": 0.180992, + "grad_norm": 0.98828125, + "learning_rate": 8.256290322580646e-05, + "loss": 0.1787, + "step": 11312 + }, + { + "epoch": 0.181008, + "grad_norm": 0.7578125, + "learning_rate": 8.256129032258066e-05, + "loss": 0.1458, + "step": 11313 + }, + { + "epoch": 0.181024, + "grad_norm": 0.66796875, + "learning_rate": 8.255967741935484e-05, + "loss": 0.1601, + "step": 11314 + }, + { + "epoch": 0.18104, + "grad_norm": 0.69140625, + "learning_rate": 8.255806451612904e-05, + "loss": 0.1603, + "step": 11315 + }, + { + "epoch": 0.181056, + "grad_norm": 0.72265625, + "learning_rate": 8.255645161290323e-05, + "loss": 0.1758, + "step": 11316 + }, + { + "epoch": 0.181072, + "grad_norm": 1.09375, + "learning_rate": 8.255483870967742e-05, + "loss": 0.1779, + "step": 11317 + }, + { + "epoch": 0.181088, + "grad_norm": 0.87890625, + "learning_rate": 8.255322580645162e-05, + "loss": 0.2164, + "step": 11318 + }, + { + "epoch": 0.181104, + "grad_norm": 1.15625, + "learning_rate": 8.25516129032258e-05, + "loss": 0.1586, + "step": 11319 + }, + { + "epoch": 0.18112, + "grad_norm": 0.69140625, + "learning_rate": 8.255e-05, + "loss": 0.1347, + "step": 11320 + }, + { + "epoch": 0.181136, + "grad_norm": 0.99609375, + "learning_rate": 8.254838709677419e-05, + "loss": 0.1841, + "step": 11321 + }, + { + "epoch": 0.181152, + "grad_norm": 1.0234375, + "learning_rate": 8.254677419354839e-05, + "loss": 0.1501, + "step": 11322 + }, + { + "epoch": 0.181168, + "grad_norm": 0.93359375, + "learning_rate": 8.254516129032259e-05, + "loss": 0.1713, + "step": 11323 + }, + { + "epoch": 0.181184, + "grad_norm": 1.1015625, + "learning_rate": 8.254354838709679e-05, + "loss": 0.2151, + "step": 11324 + }, + { + "epoch": 0.1812, + "grad_norm": 0.7421875, + "learning_rate": 8.254193548387097e-05, + "loss": 0.1769, + "step": 11325 + }, + { + "epoch": 0.181216, + "grad_norm": 0.91796875, + "learning_rate": 8.254032258064517e-05, + "loss": 0.1683, + "step": 11326 + }, + { + "epoch": 0.181232, + "grad_norm": 0.84765625, + "learning_rate": 8.253870967741936e-05, + "loss": 0.1415, + "step": 11327 + }, + { + "epoch": 0.181248, + "grad_norm": 0.5546875, + "learning_rate": 8.253709677419356e-05, + "loss": 0.1516, + "step": 11328 + }, + { + "epoch": 0.181264, + "grad_norm": 0.7890625, + "learning_rate": 8.253548387096774e-05, + "loss": 0.178, + "step": 11329 + }, + { + "epoch": 0.18128, + "grad_norm": 0.83984375, + "learning_rate": 8.253387096774194e-05, + "loss": 0.1873, + "step": 11330 + }, + { + "epoch": 0.181296, + "grad_norm": 1.046875, + "learning_rate": 8.253225806451613e-05, + "loss": 0.1849, + "step": 11331 + }, + { + "epoch": 0.181312, + "grad_norm": 1.8203125, + "learning_rate": 8.253064516129033e-05, + "loss": 0.1874, + "step": 11332 + }, + { + "epoch": 0.181328, + "grad_norm": 0.70703125, + "learning_rate": 8.252903225806451e-05, + "loss": 0.1907, + "step": 11333 + }, + { + "epoch": 0.181344, + "grad_norm": 0.77734375, + "learning_rate": 8.252741935483871e-05, + "loss": 0.2005, + "step": 11334 + }, + { + "epoch": 0.18136, + "grad_norm": 1.0703125, + "learning_rate": 8.252580645161291e-05, + "loss": 0.1926, + "step": 11335 + }, + { + "epoch": 0.181376, + "grad_norm": 0.8046875, + "learning_rate": 8.25241935483871e-05, + "loss": 0.1668, + "step": 11336 + }, + { + "epoch": 0.181392, + "grad_norm": 1.0546875, + "learning_rate": 8.25225806451613e-05, + "loss": 0.1845, + "step": 11337 + }, + { + "epoch": 0.181408, + "grad_norm": 0.69921875, + "learning_rate": 8.252096774193549e-05, + "loss": 0.1641, + "step": 11338 + }, + { + "epoch": 0.181424, + "grad_norm": 0.92578125, + "learning_rate": 8.251935483870969e-05, + "loss": 0.193, + "step": 11339 + }, + { + "epoch": 0.18144, + "grad_norm": 0.75390625, + "learning_rate": 8.251774193548387e-05, + "loss": 0.1351, + "step": 11340 + }, + { + "epoch": 0.181456, + "grad_norm": 0.65625, + "learning_rate": 8.251612903225807e-05, + "loss": 0.1392, + "step": 11341 + }, + { + "epoch": 0.181472, + "grad_norm": 0.66796875, + "learning_rate": 8.251451612903226e-05, + "loss": 0.1739, + "step": 11342 + }, + { + "epoch": 0.181488, + "grad_norm": 0.80078125, + "learning_rate": 8.251290322580646e-05, + "loss": 0.206, + "step": 11343 + }, + { + "epoch": 0.181504, + "grad_norm": 0.73046875, + "learning_rate": 8.251129032258064e-05, + "loss": 0.1728, + "step": 11344 + }, + { + "epoch": 0.18152, + "grad_norm": 1.0703125, + "learning_rate": 8.250967741935484e-05, + "loss": 0.1917, + "step": 11345 + }, + { + "epoch": 0.181536, + "grad_norm": 0.90625, + "learning_rate": 8.250806451612904e-05, + "loss": 0.1619, + "step": 11346 + }, + { + "epoch": 0.181552, + "grad_norm": 0.85546875, + "learning_rate": 8.250645161290324e-05, + "loss": 0.1392, + "step": 11347 + }, + { + "epoch": 0.181568, + "grad_norm": 0.88671875, + "learning_rate": 8.250483870967743e-05, + "loss": 0.1619, + "step": 11348 + }, + { + "epoch": 0.181584, + "grad_norm": 0.58984375, + "learning_rate": 8.250322580645161e-05, + "loss": 0.1786, + "step": 11349 + }, + { + "epoch": 0.1816, + "grad_norm": 0.85546875, + "learning_rate": 8.250161290322581e-05, + "loss": 0.1635, + "step": 11350 + }, + { + "epoch": 0.181616, + "grad_norm": 0.7109375, + "learning_rate": 8.25e-05, + "loss": 0.199, + "step": 11351 + }, + { + "epoch": 0.181632, + "grad_norm": 0.6875, + "learning_rate": 8.24983870967742e-05, + "loss": 0.1499, + "step": 11352 + }, + { + "epoch": 0.181648, + "grad_norm": 0.546875, + "learning_rate": 8.249677419354839e-05, + "loss": 0.1685, + "step": 11353 + }, + { + "epoch": 0.181664, + "grad_norm": 0.51953125, + "learning_rate": 8.249516129032258e-05, + "loss": 0.1486, + "step": 11354 + }, + { + "epoch": 0.18168, + "grad_norm": 0.69921875, + "learning_rate": 8.249354838709677e-05, + "loss": 0.1538, + "step": 11355 + }, + { + "epoch": 0.181696, + "grad_norm": 0.68359375, + "learning_rate": 8.249193548387097e-05, + "loss": 0.1703, + "step": 11356 + }, + { + "epoch": 0.181712, + "grad_norm": 1.0, + "learning_rate": 8.249032258064516e-05, + "loss": 0.1854, + "step": 11357 + }, + { + "epoch": 0.181728, + "grad_norm": 0.95703125, + "learning_rate": 8.248870967741936e-05, + "loss": 0.2263, + "step": 11358 + }, + { + "epoch": 0.181744, + "grad_norm": 0.7734375, + "learning_rate": 8.248709677419356e-05, + "loss": 0.1946, + "step": 11359 + }, + { + "epoch": 0.18176, + "grad_norm": 0.96875, + "learning_rate": 8.248548387096776e-05, + "loss": 0.1776, + "step": 11360 + }, + { + "epoch": 0.181776, + "grad_norm": 0.63671875, + "learning_rate": 8.248387096774194e-05, + "loss": 0.1848, + "step": 11361 + }, + { + "epoch": 0.181792, + "grad_norm": 0.8671875, + "learning_rate": 8.248225806451614e-05, + "loss": 0.2029, + "step": 11362 + }, + { + "epoch": 0.181808, + "grad_norm": 1.0625, + "learning_rate": 8.248064516129033e-05, + "loss": 0.1246, + "step": 11363 + }, + { + "epoch": 0.181824, + "grad_norm": 0.76171875, + "learning_rate": 8.247903225806451e-05, + "loss": 0.1753, + "step": 11364 + }, + { + "epoch": 0.18184, + "grad_norm": 1.25, + "learning_rate": 8.247741935483871e-05, + "loss": 0.1777, + "step": 11365 + }, + { + "epoch": 0.181856, + "grad_norm": 1.0078125, + "learning_rate": 8.24758064516129e-05, + "loss": 0.2501, + "step": 11366 + }, + { + "epoch": 0.181872, + "grad_norm": 0.8515625, + "learning_rate": 8.24741935483871e-05, + "loss": 0.1518, + "step": 11367 + }, + { + "epoch": 0.181888, + "grad_norm": 0.875, + "learning_rate": 8.247258064516128e-05, + "loss": 0.1714, + "step": 11368 + }, + { + "epoch": 0.181904, + "grad_norm": 0.8359375, + "learning_rate": 8.247096774193548e-05, + "loss": 0.1499, + "step": 11369 + }, + { + "epoch": 0.18192, + "grad_norm": 0.96484375, + "learning_rate": 8.246935483870968e-05, + "loss": 0.1511, + "step": 11370 + }, + { + "epoch": 0.181936, + "grad_norm": 0.671875, + "learning_rate": 8.246774193548388e-05, + "loss": 0.1466, + "step": 11371 + }, + { + "epoch": 0.181952, + "grad_norm": 1.1484375, + "learning_rate": 8.246612903225807e-05, + "loss": 0.1603, + "step": 11372 + }, + { + "epoch": 0.181968, + "grad_norm": 0.87890625, + "learning_rate": 8.246451612903227e-05, + "loss": 0.1873, + "step": 11373 + }, + { + "epoch": 0.181984, + "grad_norm": 1.3828125, + "learning_rate": 8.246290322580646e-05, + "loss": 0.1543, + "step": 11374 + }, + { + "epoch": 0.182, + "grad_norm": 0.6875, + "learning_rate": 8.246129032258066e-05, + "loss": 0.1487, + "step": 11375 + }, + { + "epoch": 0.182016, + "grad_norm": 0.64453125, + "learning_rate": 8.245967741935484e-05, + "loss": 0.1698, + "step": 11376 + }, + { + "epoch": 0.182032, + "grad_norm": 0.94140625, + "learning_rate": 8.245806451612904e-05, + "loss": 0.152, + "step": 11377 + }, + { + "epoch": 0.182048, + "grad_norm": 0.79296875, + "learning_rate": 8.245645161290323e-05, + "loss": 0.1555, + "step": 11378 + }, + { + "epoch": 0.182064, + "grad_norm": 0.97265625, + "learning_rate": 8.245483870967743e-05, + "loss": 0.1295, + "step": 11379 + }, + { + "epoch": 0.18208, + "grad_norm": 1.203125, + "learning_rate": 8.245322580645161e-05, + "loss": 0.2041, + "step": 11380 + }, + { + "epoch": 0.182096, + "grad_norm": 0.65234375, + "learning_rate": 8.245161290322581e-05, + "loss": 0.1636, + "step": 11381 + }, + { + "epoch": 0.182112, + "grad_norm": 0.68359375, + "learning_rate": 8.245e-05, + "loss": 0.1728, + "step": 11382 + }, + { + "epoch": 0.182128, + "grad_norm": 0.87890625, + "learning_rate": 8.24483870967742e-05, + "loss": 0.157, + "step": 11383 + }, + { + "epoch": 0.182144, + "grad_norm": 0.921875, + "learning_rate": 8.24467741935484e-05, + "loss": 0.1629, + "step": 11384 + }, + { + "epoch": 0.18216, + "grad_norm": 1.3359375, + "learning_rate": 8.244516129032258e-05, + "loss": 0.1785, + "step": 11385 + }, + { + "epoch": 0.182176, + "grad_norm": 1.4375, + "learning_rate": 8.244354838709678e-05, + "loss": 0.1992, + "step": 11386 + }, + { + "epoch": 0.182192, + "grad_norm": 2.03125, + "learning_rate": 8.244193548387097e-05, + "loss": 0.1542, + "step": 11387 + }, + { + "epoch": 0.182208, + "grad_norm": 0.61328125, + "learning_rate": 8.244032258064517e-05, + "loss": 0.1629, + "step": 11388 + }, + { + "epoch": 0.182224, + "grad_norm": 0.96875, + "learning_rate": 8.243870967741936e-05, + "loss": 0.1643, + "step": 11389 + }, + { + "epoch": 0.18224, + "grad_norm": 0.69140625, + "learning_rate": 8.243709677419355e-05, + "loss": 0.1749, + "step": 11390 + }, + { + "epoch": 0.182256, + "grad_norm": 1.09375, + "learning_rate": 8.243548387096774e-05, + "loss": 0.2004, + "step": 11391 + }, + { + "epoch": 0.182272, + "grad_norm": 0.7109375, + "learning_rate": 8.243387096774194e-05, + "loss": 0.1808, + "step": 11392 + }, + { + "epoch": 0.182288, + "grad_norm": 0.62890625, + "learning_rate": 8.243225806451613e-05, + "loss": 0.1697, + "step": 11393 + }, + { + "epoch": 0.182304, + "grad_norm": 0.828125, + "learning_rate": 8.243064516129033e-05, + "loss": 0.1845, + "step": 11394 + }, + { + "epoch": 0.18232, + "grad_norm": 1.390625, + "learning_rate": 8.242903225806453e-05, + "loss": 0.1725, + "step": 11395 + }, + { + "epoch": 0.182336, + "grad_norm": 0.84765625, + "learning_rate": 8.242741935483871e-05, + "loss": 0.193, + "step": 11396 + }, + { + "epoch": 0.182352, + "grad_norm": 0.890625, + "learning_rate": 8.242580645161291e-05, + "loss": 0.1471, + "step": 11397 + }, + { + "epoch": 0.182368, + "grad_norm": 0.6796875, + "learning_rate": 8.24241935483871e-05, + "loss": 0.1845, + "step": 11398 + }, + { + "epoch": 0.182384, + "grad_norm": 0.83984375, + "learning_rate": 8.24225806451613e-05, + "loss": 0.1479, + "step": 11399 + }, + { + "epoch": 0.1824, + "grad_norm": 0.80078125, + "learning_rate": 8.242096774193548e-05, + "loss": 0.1244, + "step": 11400 + }, + { + "epoch": 0.182416, + "grad_norm": 1.1015625, + "learning_rate": 8.241935483870968e-05, + "loss": 0.1454, + "step": 11401 + }, + { + "epoch": 0.182432, + "grad_norm": 1.015625, + "learning_rate": 8.241774193548387e-05, + "loss": 0.1831, + "step": 11402 + }, + { + "epoch": 0.182448, + "grad_norm": 0.58984375, + "learning_rate": 8.241612903225807e-05, + "loss": 0.1494, + "step": 11403 + }, + { + "epoch": 0.182464, + "grad_norm": 0.84765625, + "learning_rate": 8.241451612903225e-05, + "loss": 0.1608, + "step": 11404 + }, + { + "epoch": 0.18248, + "grad_norm": 1.0078125, + "learning_rate": 8.241290322580645e-05, + "loss": 0.1715, + "step": 11405 + }, + { + "epoch": 0.182496, + "grad_norm": 1.46875, + "learning_rate": 8.241129032258065e-05, + "loss": 0.1739, + "step": 11406 + }, + { + "epoch": 0.182512, + "grad_norm": 0.8671875, + "learning_rate": 8.240967741935485e-05, + "loss": 0.1785, + "step": 11407 + }, + { + "epoch": 0.182528, + "grad_norm": 1.2421875, + "learning_rate": 8.240806451612904e-05, + "loss": 0.1973, + "step": 11408 + }, + { + "epoch": 0.182544, + "grad_norm": 0.546875, + "learning_rate": 8.240645161290324e-05, + "loss": 0.157, + "step": 11409 + }, + { + "epoch": 0.18256, + "grad_norm": 0.71875, + "learning_rate": 8.240483870967743e-05, + "loss": 0.1898, + "step": 11410 + }, + { + "epoch": 0.182576, + "grad_norm": 0.53125, + "learning_rate": 8.240322580645161e-05, + "loss": 0.1539, + "step": 11411 + }, + { + "epoch": 0.182592, + "grad_norm": 0.96484375, + "learning_rate": 8.240161290322581e-05, + "loss": 0.1618, + "step": 11412 + }, + { + "epoch": 0.182608, + "grad_norm": 0.85546875, + "learning_rate": 8.24e-05, + "loss": 0.1687, + "step": 11413 + }, + { + "epoch": 0.182624, + "grad_norm": 1.046875, + "learning_rate": 8.23983870967742e-05, + "loss": 0.184, + "step": 11414 + }, + { + "epoch": 0.18264, + "grad_norm": 0.73828125, + "learning_rate": 8.239677419354838e-05, + "loss": 0.1592, + "step": 11415 + }, + { + "epoch": 0.182656, + "grad_norm": 0.68359375, + "learning_rate": 8.239516129032258e-05, + "loss": 0.1784, + "step": 11416 + }, + { + "epoch": 0.182672, + "grad_norm": 0.625, + "learning_rate": 8.239354838709677e-05, + "loss": 0.1511, + "step": 11417 + }, + { + "epoch": 0.182688, + "grad_norm": 0.74609375, + "learning_rate": 8.239193548387097e-05, + "loss": 0.1645, + "step": 11418 + }, + { + "epoch": 0.182704, + "grad_norm": 0.75, + "learning_rate": 8.239032258064517e-05, + "loss": 0.2067, + "step": 11419 + }, + { + "epoch": 0.18272, + "grad_norm": 1.2265625, + "learning_rate": 8.238870967741937e-05, + "loss": 0.1807, + "step": 11420 + }, + { + "epoch": 0.182736, + "grad_norm": 0.765625, + "learning_rate": 8.238709677419355e-05, + "loss": 0.1804, + "step": 11421 + }, + { + "epoch": 0.182752, + "grad_norm": 0.5546875, + "learning_rate": 8.238548387096775e-05, + "loss": 0.2026, + "step": 11422 + }, + { + "epoch": 0.182768, + "grad_norm": 0.71484375, + "learning_rate": 8.238387096774194e-05, + "loss": 0.1541, + "step": 11423 + }, + { + "epoch": 0.182784, + "grad_norm": 0.67578125, + "learning_rate": 8.238225806451614e-05, + "loss": 0.1664, + "step": 11424 + }, + { + "epoch": 0.1828, + "grad_norm": 0.66015625, + "learning_rate": 8.238064516129032e-05, + "loss": 0.1748, + "step": 11425 + }, + { + "epoch": 0.182816, + "grad_norm": 0.58203125, + "learning_rate": 8.237903225806451e-05, + "loss": 0.1922, + "step": 11426 + }, + { + "epoch": 0.182832, + "grad_norm": 0.87109375, + "learning_rate": 8.237741935483871e-05, + "loss": 0.1919, + "step": 11427 + }, + { + "epoch": 0.182848, + "grad_norm": 0.8203125, + "learning_rate": 8.23758064516129e-05, + "loss": 0.1406, + "step": 11428 + }, + { + "epoch": 0.182864, + "grad_norm": 0.86328125, + "learning_rate": 8.23741935483871e-05, + "loss": 0.1604, + "step": 11429 + }, + { + "epoch": 0.18288, + "grad_norm": 0.65234375, + "learning_rate": 8.23725806451613e-05, + "loss": 0.1558, + "step": 11430 + }, + { + "epoch": 0.182896, + "grad_norm": 0.56640625, + "learning_rate": 8.23709677419355e-05, + "loss": 0.1492, + "step": 11431 + }, + { + "epoch": 0.182912, + "grad_norm": 0.58203125, + "learning_rate": 8.236935483870968e-05, + "loss": 0.1866, + "step": 11432 + }, + { + "epoch": 0.182928, + "grad_norm": 0.78125, + "learning_rate": 8.236774193548388e-05, + "loss": 0.1744, + "step": 11433 + }, + { + "epoch": 0.182944, + "grad_norm": 0.73046875, + "learning_rate": 8.236612903225807e-05, + "loss": 0.1519, + "step": 11434 + }, + { + "epoch": 0.18296, + "grad_norm": 0.76953125, + "learning_rate": 8.236451612903227e-05, + "loss": 0.1988, + "step": 11435 + }, + { + "epoch": 0.182976, + "grad_norm": 1.0703125, + "learning_rate": 8.236290322580645e-05, + "loss": 0.1568, + "step": 11436 + }, + { + "epoch": 0.182992, + "grad_norm": 0.66796875, + "learning_rate": 8.236129032258065e-05, + "loss": 0.1644, + "step": 11437 + }, + { + "epoch": 0.183008, + "grad_norm": 1.0546875, + "learning_rate": 8.235967741935484e-05, + "loss": 0.1505, + "step": 11438 + }, + { + "epoch": 0.183024, + "grad_norm": 0.921875, + "learning_rate": 8.235806451612904e-05, + "loss": 0.1535, + "step": 11439 + }, + { + "epoch": 0.18304, + "grad_norm": 0.75, + "learning_rate": 8.235645161290322e-05, + "loss": 0.1733, + "step": 11440 + }, + { + "epoch": 0.183056, + "grad_norm": 0.83203125, + "learning_rate": 8.235483870967742e-05, + "loss": 0.1715, + "step": 11441 + }, + { + "epoch": 0.183072, + "grad_norm": 1.0234375, + "learning_rate": 8.235322580645162e-05, + "loss": 0.1889, + "step": 11442 + }, + { + "epoch": 0.183088, + "grad_norm": 1.046875, + "learning_rate": 8.235161290322581e-05, + "loss": 0.1548, + "step": 11443 + }, + { + "epoch": 0.183104, + "grad_norm": 0.5859375, + "learning_rate": 8.235000000000001e-05, + "loss": 0.1932, + "step": 11444 + }, + { + "epoch": 0.18312, + "grad_norm": 0.88671875, + "learning_rate": 8.23483870967742e-05, + "loss": 0.1822, + "step": 11445 + }, + { + "epoch": 0.183136, + "grad_norm": 0.83984375, + "learning_rate": 8.23467741935484e-05, + "loss": 0.1725, + "step": 11446 + }, + { + "epoch": 0.183152, + "grad_norm": 0.6015625, + "learning_rate": 8.234516129032258e-05, + "loss": 0.1849, + "step": 11447 + }, + { + "epoch": 0.183168, + "grad_norm": 1.0703125, + "learning_rate": 8.234354838709678e-05, + "loss": 0.2292, + "step": 11448 + }, + { + "epoch": 0.183184, + "grad_norm": 1.6328125, + "learning_rate": 8.234193548387097e-05, + "loss": 0.193, + "step": 11449 + }, + { + "epoch": 0.1832, + "grad_norm": 0.6484375, + "learning_rate": 8.234032258064517e-05, + "loss": 0.1588, + "step": 11450 + }, + { + "epoch": 0.183216, + "grad_norm": 0.7421875, + "learning_rate": 8.233870967741935e-05, + "loss": 0.1612, + "step": 11451 + }, + { + "epoch": 0.183232, + "grad_norm": 1.484375, + "learning_rate": 8.233709677419355e-05, + "loss": 0.1529, + "step": 11452 + }, + { + "epoch": 0.183248, + "grad_norm": 0.93359375, + "learning_rate": 8.233548387096774e-05, + "loss": 0.1669, + "step": 11453 + }, + { + "epoch": 0.183264, + "grad_norm": 0.78125, + "learning_rate": 8.233387096774194e-05, + "loss": 0.1786, + "step": 11454 + }, + { + "epoch": 0.18328, + "grad_norm": 1.1953125, + "learning_rate": 8.233225806451614e-05, + "loss": 0.1636, + "step": 11455 + }, + { + "epoch": 0.183296, + "grad_norm": 1.0546875, + "learning_rate": 8.233064516129034e-05, + "loss": 0.206, + "step": 11456 + }, + { + "epoch": 0.183312, + "grad_norm": 1.03125, + "learning_rate": 8.232903225806452e-05, + "loss": 0.1906, + "step": 11457 + }, + { + "epoch": 0.183328, + "grad_norm": 0.7421875, + "learning_rate": 8.232741935483871e-05, + "loss": 0.2209, + "step": 11458 + }, + { + "epoch": 0.183344, + "grad_norm": 0.59765625, + "learning_rate": 8.232580645161291e-05, + "loss": 0.1387, + "step": 11459 + }, + { + "epoch": 0.18336, + "grad_norm": 0.82421875, + "learning_rate": 8.23241935483871e-05, + "loss": 0.1758, + "step": 11460 + }, + { + "epoch": 0.183376, + "grad_norm": 0.73046875, + "learning_rate": 8.23225806451613e-05, + "loss": 0.1624, + "step": 11461 + }, + { + "epoch": 0.183392, + "grad_norm": 0.796875, + "learning_rate": 8.232096774193548e-05, + "loss": 0.1645, + "step": 11462 + }, + { + "epoch": 0.183408, + "grad_norm": 0.80078125, + "learning_rate": 8.231935483870968e-05, + "loss": 0.1608, + "step": 11463 + }, + { + "epoch": 0.183424, + "grad_norm": 0.55078125, + "learning_rate": 8.231774193548387e-05, + "loss": 0.1612, + "step": 11464 + }, + { + "epoch": 0.18344, + "grad_norm": 0.6015625, + "learning_rate": 8.231612903225807e-05, + "loss": 0.1635, + "step": 11465 + }, + { + "epoch": 0.183456, + "grad_norm": 0.84765625, + "learning_rate": 8.231451612903227e-05, + "loss": 0.2237, + "step": 11466 + }, + { + "epoch": 0.183472, + "grad_norm": 0.6953125, + "learning_rate": 8.231290322580647e-05, + "loss": 0.1794, + "step": 11467 + }, + { + "epoch": 0.183488, + "grad_norm": 0.99609375, + "learning_rate": 8.231129032258065e-05, + "loss": 0.1436, + "step": 11468 + }, + { + "epoch": 0.183504, + "grad_norm": 0.92578125, + "learning_rate": 8.230967741935485e-05, + "loss": 0.1773, + "step": 11469 + }, + { + "epoch": 0.18352, + "grad_norm": 0.92578125, + "learning_rate": 8.230806451612904e-05, + "loss": 0.1721, + "step": 11470 + }, + { + "epoch": 0.183536, + "grad_norm": 0.6796875, + "learning_rate": 8.230645161290324e-05, + "loss": 0.1644, + "step": 11471 + }, + { + "epoch": 0.183552, + "grad_norm": 0.5703125, + "learning_rate": 8.230483870967742e-05, + "loss": 0.1324, + "step": 11472 + }, + { + "epoch": 0.183568, + "grad_norm": 0.99609375, + "learning_rate": 8.230322580645161e-05, + "loss": 0.1671, + "step": 11473 + }, + { + "epoch": 0.183584, + "grad_norm": 0.7109375, + "learning_rate": 8.230161290322581e-05, + "loss": 0.1807, + "step": 11474 + }, + { + "epoch": 0.1836, + "grad_norm": 0.70703125, + "learning_rate": 8.23e-05, + "loss": 0.1679, + "step": 11475 + }, + { + "epoch": 0.183616, + "grad_norm": 0.73046875, + "learning_rate": 8.22983870967742e-05, + "loss": 0.2303, + "step": 11476 + }, + { + "epoch": 0.183632, + "grad_norm": 0.90625, + "learning_rate": 8.229677419354838e-05, + "loss": 0.1878, + "step": 11477 + }, + { + "epoch": 0.183648, + "grad_norm": 0.59375, + "learning_rate": 8.229516129032258e-05, + "loss": 0.1563, + "step": 11478 + }, + { + "epoch": 0.183664, + "grad_norm": 0.63671875, + "learning_rate": 8.229354838709678e-05, + "loss": 0.1425, + "step": 11479 + }, + { + "epoch": 0.18368, + "grad_norm": 0.9140625, + "learning_rate": 8.229193548387098e-05, + "loss": 0.2111, + "step": 11480 + }, + { + "epoch": 0.183696, + "grad_norm": 0.875, + "learning_rate": 8.229032258064517e-05, + "loss": 0.1678, + "step": 11481 + }, + { + "epoch": 0.183712, + "grad_norm": 0.96875, + "learning_rate": 8.228870967741936e-05, + "loss": 0.1748, + "step": 11482 + }, + { + "epoch": 0.183728, + "grad_norm": 0.67578125, + "learning_rate": 8.228709677419355e-05, + "loss": 0.1539, + "step": 11483 + }, + { + "epoch": 0.183744, + "grad_norm": 0.8828125, + "learning_rate": 8.228548387096775e-05, + "loss": 0.1817, + "step": 11484 + }, + { + "epoch": 0.18376, + "grad_norm": 1.4296875, + "learning_rate": 8.228387096774194e-05, + "loss": 0.1625, + "step": 11485 + }, + { + "epoch": 0.183776, + "grad_norm": 0.73046875, + "learning_rate": 8.228225806451614e-05, + "loss": 0.1967, + "step": 11486 + }, + { + "epoch": 0.183792, + "grad_norm": 0.80859375, + "learning_rate": 8.228064516129032e-05, + "loss": 0.1486, + "step": 11487 + }, + { + "epoch": 0.183808, + "grad_norm": 0.76953125, + "learning_rate": 8.227903225806451e-05, + "loss": 0.1673, + "step": 11488 + }, + { + "epoch": 0.183824, + "grad_norm": 1.0078125, + "learning_rate": 8.227741935483871e-05, + "loss": 0.1775, + "step": 11489 + }, + { + "epoch": 0.18384, + "grad_norm": 0.9921875, + "learning_rate": 8.227580645161291e-05, + "loss": 0.1777, + "step": 11490 + }, + { + "epoch": 0.183856, + "grad_norm": 0.734375, + "learning_rate": 8.227419354838711e-05, + "loss": 0.1831, + "step": 11491 + }, + { + "epoch": 0.183872, + "grad_norm": 1.7734375, + "learning_rate": 8.22725806451613e-05, + "loss": 0.1857, + "step": 11492 + }, + { + "epoch": 0.183888, + "grad_norm": 1.1796875, + "learning_rate": 8.227096774193549e-05, + "loss": 0.1977, + "step": 11493 + }, + { + "epoch": 0.183904, + "grad_norm": 1.171875, + "learning_rate": 8.226935483870968e-05, + "loss": 0.225, + "step": 11494 + }, + { + "epoch": 0.18392, + "grad_norm": 0.85546875, + "learning_rate": 8.226774193548388e-05, + "loss": 0.1718, + "step": 11495 + }, + { + "epoch": 0.183936, + "grad_norm": 0.75390625, + "learning_rate": 8.226612903225806e-05, + "loss": 0.1453, + "step": 11496 + }, + { + "epoch": 0.183952, + "grad_norm": 0.7734375, + "learning_rate": 8.226451612903226e-05, + "loss": 0.19, + "step": 11497 + }, + { + "epoch": 0.183968, + "grad_norm": 1.1171875, + "learning_rate": 8.226290322580645e-05, + "loss": 0.1487, + "step": 11498 + }, + { + "epoch": 0.183984, + "grad_norm": 0.640625, + "learning_rate": 8.226129032258065e-05, + "loss": 0.1743, + "step": 11499 + }, + { + "epoch": 0.184, + "grad_norm": 0.70703125, + "learning_rate": 8.225967741935484e-05, + "loss": 0.1531, + "step": 11500 + }, + { + "epoch": 0.184016, + "grad_norm": 1.1640625, + "learning_rate": 8.225806451612904e-05, + "loss": 0.1703, + "step": 11501 + }, + { + "epoch": 0.184032, + "grad_norm": 0.98046875, + "learning_rate": 8.225645161290324e-05, + "loss": 0.193, + "step": 11502 + }, + { + "epoch": 0.184048, + "grad_norm": 1.3984375, + "learning_rate": 8.225483870967744e-05, + "loss": 0.1647, + "step": 11503 + }, + { + "epoch": 0.184064, + "grad_norm": 0.58203125, + "learning_rate": 8.225322580645162e-05, + "loss": 0.1585, + "step": 11504 + }, + { + "epoch": 0.18408, + "grad_norm": 1.1640625, + "learning_rate": 8.225161290322581e-05, + "loss": 0.2334, + "step": 11505 + }, + { + "epoch": 0.184096, + "grad_norm": 1.09375, + "learning_rate": 8.225000000000001e-05, + "loss": 0.173, + "step": 11506 + }, + { + "epoch": 0.184112, + "grad_norm": 1.140625, + "learning_rate": 8.224838709677419e-05, + "loss": 0.1702, + "step": 11507 + }, + { + "epoch": 0.184128, + "grad_norm": 0.86328125, + "learning_rate": 8.224677419354839e-05, + "loss": 0.1641, + "step": 11508 + }, + { + "epoch": 0.184144, + "grad_norm": 1.28125, + "learning_rate": 8.224516129032258e-05, + "loss": 0.186, + "step": 11509 + }, + { + "epoch": 0.18416, + "grad_norm": 0.671875, + "learning_rate": 8.224354838709678e-05, + "loss": 0.1466, + "step": 11510 + }, + { + "epoch": 0.184176, + "grad_norm": 0.74609375, + "learning_rate": 8.224193548387096e-05, + "loss": 0.1655, + "step": 11511 + }, + { + "epoch": 0.184192, + "grad_norm": 0.91796875, + "learning_rate": 8.224032258064516e-05, + "loss": 0.2207, + "step": 11512 + }, + { + "epoch": 0.184208, + "grad_norm": 0.9453125, + "learning_rate": 8.223870967741935e-05, + "loss": 0.1761, + "step": 11513 + }, + { + "epoch": 0.184224, + "grad_norm": 1.265625, + "learning_rate": 8.223709677419355e-05, + "loss": 0.1406, + "step": 11514 + }, + { + "epoch": 0.18424, + "grad_norm": 1.734375, + "learning_rate": 8.223548387096775e-05, + "loss": 0.2458, + "step": 11515 + }, + { + "epoch": 0.184256, + "grad_norm": 0.76953125, + "learning_rate": 8.223387096774195e-05, + "loss": 0.1619, + "step": 11516 + }, + { + "epoch": 0.184272, + "grad_norm": 1.40625, + "learning_rate": 8.223225806451614e-05, + "loss": 0.1495, + "step": 11517 + }, + { + "epoch": 0.184288, + "grad_norm": 1.3125, + "learning_rate": 8.223064516129033e-05, + "loss": 0.1833, + "step": 11518 + }, + { + "epoch": 0.184304, + "grad_norm": 0.640625, + "learning_rate": 8.222903225806452e-05, + "loss": 0.142, + "step": 11519 + }, + { + "epoch": 0.18432, + "grad_norm": 0.77734375, + "learning_rate": 8.222741935483871e-05, + "loss": 0.1955, + "step": 11520 + }, + { + "epoch": 0.184336, + "grad_norm": 0.84765625, + "learning_rate": 8.22258064516129e-05, + "loss": 0.1663, + "step": 11521 + }, + { + "epoch": 0.184352, + "grad_norm": 0.99609375, + "learning_rate": 8.222419354838709e-05, + "loss": 0.1693, + "step": 11522 + }, + { + "epoch": 0.184368, + "grad_norm": 0.7265625, + "learning_rate": 8.222258064516129e-05, + "loss": 0.1358, + "step": 11523 + }, + { + "epoch": 0.184384, + "grad_norm": 0.423828125, + "learning_rate": 8.222096774193548e-05, + "loss": 0.1178, + "step": 11524 + }, + { + "epoch": 0.1844, + "grad_norm": 1.4140625, + "learning_rate": 8.221935483870968e-05, + "loss": 0.1666, + "step": 11525 + }, + { + "epoch": 0.184416, + "grad_norm": 0.83984375, + "learning_rate": 8.221774193548388e-05, + "loss": 0.1578, + "step": 11526 + }, + { + "epoch": 0.184432, + "grad_norm": 1.0390625, + "learning_rate": 8.221612903225808e-05, + "loss": 0.2036, + "step": 11527 + }, + { + "epoch": 0.184448, + "grad_norm": 0.73828125, + "learning_rate": 8.221451612903226e-05, + "loss": 0.1787, + "step": 11528 + }, + { + "epoch": 0.184464, + "grad_norm": 0.76953125, + "learning_rate": 8.221290322580646e-05, + "loss": 0.1571, + "step": 11529 + }, + { + "epoch": 0.18448, + "grad_norm": 0.7890625, + "learning_rate": 8.221129032258065e-05, + "loss": 0.1732, + "step": 11530 + }, + { + "epoch": 0.184496, + "grad_norm": 0.67578125, + "learning_rate": 8.220967741935485e-05, + "loss": 0.1427, + "step": 11531 + }, + { + "epoch": 0.184512, + "grad_norm": 0.98828125, + "learning_rate": 8.220806451612903e-05, + "loss": 0.2222, + "step": 11532 + }, + { + "epoch": 0.184528, + "grad_norm": 0.69921875, + "learning_rate": 8.220645161290323e-05, + "loss": 0.169, + "step": 11533 + }, + { + "epoch": 0.184544, + "grad_norm": 0.7734375, + "learning_rate": 8.220483870967742e-05, + "loss": 0.1361, + "step": 11534 + }, + { + "epoch": 0.18456, + "grad_norm": 0.7265625, + "learning_rate": 8.22032258064516e-05, + "loss": 0.177, + "step": 11535 + }, + { + "epoch": 0.184576, + "grad_norm": 0.54296875, + "learning_rate": 8.22016129032258e-05, + "loss": 0.1212, + "step": 11536 + }, + { + "epoch": 0.184592, + "grad_norm": 1.0625, + "learning_rate": 8.22e-05, + "loss": 0.1983, + "step": 11537 + }, + { + "epoch": 0.184608, + "grad_norm": 0.62109375, + "learning_rate": 8.21983870967742e-05, + "loss": 0.1647, + "step": 11538 + }, + { + "epoch": 0.184624, + "grad_norm": 0.53125, + "learning_rate": 8.219677419354839e-05, + "loss": 0.1433, + "step": 11539 + }, + { + "epoch": 0.18464, + "grad_norm": 0.69921875, + "learning_rate": 8.219516129032259e-05, + "loss": 0.1839, + "step": 11540 + }, + { + "epoch": 0.184656, + "grad_norm": 0.49609375, + "learning_rate": 8.219354838709678e-05, + "loss": 0.1259, + "step": 11541 + }, + { + "epoch": 0.184672, + "grad_norm": 0.6953125, + "learning_rate": 8.219193548387098e-05, + "loss": 0.154, + "step": 11542 + }, + { + "epoch": 0.184688, + "grad_norm": 0.578125, + "learning_rate": 8.219032258064516e-05, + "loss": 0.1535, + "step": 11543 + }, + { + "epoch": 0.184704, + "grad_norm": 0.7890625, + "learning_rate": 8.218870967741936e-05, + "loss": 0.1512, + "step": 11544 + }, + { + "epoch": 0.18472, + "grad_norm": 1.3359375, + "learning_rate": 8.218709677419355e-05, + "loss": 0.1806, + "step": 11545 + }, + { + "epoch": 0.184736, + "grad_norm": 0.6796875, + "learning_rate": 8.218548387096775e-05, + "loss": 0.1629, + "step": 11546 + }, + { + "epoch": 0.184752, + "grad_norm": 0.7265625, + "learning_rate": 8.218387096774193e-05, + "loss": 0.1717, + "step": 11547 + }, + { + "epoch": 0.184768, + "grad_norm": 0.73828125, + "learning_rate": 8.218225806451613e-05, + "loss": 0.1668, + "step": 11548 + }, + { + "epoch": 0.184784, + "grad_norm": 0.6875, + "learning_rate": 8.218064516129032e-05, + "loss": 0.1703, + "step": 11549 + }, + { + "epoch": 0.1848, + "grad_norm": 0.59765625, + "learning_rate": 8.217903225806452e-05, + "loss": 0.133, + "step": 11550 + }, + { + "epoch": 0.184816, + "grad_norm": 0.67578125, + "learning_rate": 8.217741935483872e-05, + "loss": 0.1882, + "step": 11551 + }, + { + "epoch": 0.184832, + "grad_norm": 0.69921875, + "learning_rate": 8.21758064516129e-05, + "loss": 0.1578, + "step": 11552 + }, + { + "epoch": 0.184848, + "grad_norm": 0.94921875, + "learning_rate": 8.21741935483871e-05, + "loss": 0.1638, + "step": 11553 + }, + { + "epoch": 0.184864, + "grad_norm": 1.1171875, + "learning_rate": 8.217258064516129e-05, + "loss": 0.2546, + "step": 11554 + }, + { + "epoch": 0.18488, + "grad_norm": 0.86328125, + "learning_rate": 8.217096774193549e-05, + "loss": 0.1669, + "step": 11555 + }, + { + "epoch": 0.184896, + "grad_norm": 0.77734375, + "learning_rate": 8.216935483870968e-05, + "loss": 0.1682, + "step": 11556 + }, + { + "epoch": 0.184912, + "grad_norm": 0.66796875, + "learning_rate": 8.216774193548388e-05, + "loss": 0.1688, + "step": 11557 + }, + { + "epoch": 0.184928, + "grad_norm": 1.171875, + "learning_rate": 8.216612903225806e-05, + "loss": 0.198, + "step": 11558 + }, + { + "epoch": 0.184944, + "grad_norm": 0.78515625, + "learning_rate": 8.216451612903226e-05, + "loss": 0.1627, + "step": 11559 + }, + { + "epoch": 0.18496, + "grad_norm": 0.93359375, + "learning_rate": 8.216290322580645e-05, + "loss": 0.1515, + "step": 11560 + }, + { + "epoch": 0.184976, + "grad_norm": 1.078125, + "learning_rate": 8.216129032258065e-05, + "loss": 0.2138, + "step": 11561 + }, + { + "epoch": 0.184992, + "grad_norm": 0.61328125, + "learning_rate": 8.215967741935485e-05, + "loss": 0.1733, + "step": 11562 + }, + { + "epoch": 0.185008, + "grad_norm": 1.1171875, + "learning_rate": 8.215806451612905e-05, + "loss": 0.1973, + "step": 11563 + }, + { + "epoch": 0.185024, + "grad_norm": 0.703125, + "learning_rate": 8.215645161290323e-05, + "loss": 0.1609, + "step": 11564 + }, + { + "epoch": 0.18504, + "grad_norm": 0.96484375, + "learning_rate": 8.215483870967743e-05, + "loss": 0.155, + "step": 11565 + }, + { + "epoch": 0.185056, + "grad_norm": 0.56640625, + "learning_rate": 8.215322580645162e-05, + "loss": 0.1634, + "step": 11566 + }, + { + "epoch": 0.185072, + "grad_norm": 0.74609375, + "learning_rate": 8.21516129032258e-05, + "loss": 0.1744, + "step": 11567 + }, + { + "epoch": 0.185088, + "grad_norm": 0.74609375, + "learning_rate": 8.215e-05, + "loss": 0.1423, + "step": 11568 + }, + { + "epoch": 0.185104, + "grad_norm": 0.87109375, + "learning_rate": 8.214838709677419e-05, + "loss": 0.1547, + "step": 11569 + }, + { + "epoch": 0.18512, + "grad_norm": 0.65234375, + "learning_rate": 8.214677419354839e-05, + "loss": 0.1613, + "step": 11570 + }, + { + "epoch": 0.185136, + "grad_norm": 1.03125, + "learning_rate": 8.214516129032258e-05, + "loss": 0.1832, + "step": 11571 + }, + { + "epoch": 0.185152, + "grad_norm": 1.1328125, + "learning_rate": 8.214354838709678e-05, + "loss": 0.1695, + "step": 11572 + }, + { + "epoch": 0.185168, + "grad_norm": 0.66015625, + "learning_rate": 8.214193548387096e-05, + "loss": 0.1904, + "step": 11573 + }, + { + "epoch": 0.185184, + "grad_norm": 0.78125, + "learning_rate": 8.214032258064516e-05, + "loss": 0.1854, + "step": 11574 + }, + { + "epoch": 0.1852, + "grad_norm": 0.6796875, + "learning_rate": 8.213870967741936e-05, + "loss": 0.1521, + "step": 11575 + }, + { + "epoch": 0.185216, + "grad_norm": 0.66796875, + "learning_rate": 8.213709677419356e-05, + "loss": 0.1686, + "step": 11576 + }, + { + "epoch": 0.185232, + "grad_norm": 0.8203125, + "learning_rate": 8.213548387096775e-05, + "loss": 0.1794, + "step": 11577 + }, + { + "epoch": 0.185248, + "grad_norm": 0.66015625, + "learning_rate": 8.213387096774195e-05, + "loss": 0.1626, + "step": 11578 + }, + { + "epoch": 0.185264, + "grad_norm": 0.65234375, + "learning_rate": 8.213225806451613e-05, + "loss": 0.2098, + "step": 11579 + }, + { + "epoch": 0.18528, + "grad_norm": 0.99609375, + "learning_rate": 8.213064516129033e-05, + "loss": 0.1822, + "step": 11580 + }, + { + "epoch": 0.185296, + "grad_norm": 0.82421875, + "learning_rate": 8.212903225806452e-05, + "loss": 0.1676, + "step": 11581 + }, + { + "epoch": 0.185312, + "grad_norm": 0.92578125, + "learning_rate": 8.21274193548387e-05, + "loss": 0.1804, + "step": 11582 + }, + { + "epoch": 0.185328, + "grad_norm": 0.75, + "learning_rate": 8.21258064516129e-05, + "loss": 0.1554, + "step": 11583 + }, + { + "epoch": 0.185344, + "grad_norm": 0.80859375, + "learning_rate": 8.212419354838709e-05, + "loss": 0.1702, + "step": 11584 + }, + { + "epoch": 0.18536, + "grad_norm": 0.60546875, + "learning_rate": 8.212258064516129e-05, + "loss": 0.1372, + "step": 11585 + }, + { + "epoch": 0.185376, + "grad_norm": 0.546875, + "learning_rate": 8.212096774193549e-05, + "loss": 0.1577, + "step": 11586 + }, + { + "epoch": 0.185392, + "grad_norm": 1.375, + "learning_rate": 8.211935483870969e-05, + "loss": 0.1729, + "step": 11587 + }, + { + "epoch": 0.185408, + "grad_norm": 0.79296875, + "learning_rate": 8.211774193548388e-05, + "loss": 0.1455, + "step": 11588 + }, + { + "epoch": 0.185424, + "grad_norm": 0.80859375, + "learning_rate": 8.211612903225807e-05, + "loss": 0.1764, + "step": 11589 + }, + { + "epoch": 0.18544, + "grad_norm": 0.625, + "learning_rate": 8.211451612903226e-05, + "loss": 0.1985, + "step": 11590 + }, + { + "epoch": 0.185456, + "grad_norm": 0.8125, + "learning_rate": 8.211290322580646e-05, + "loss": 0.1916, + "step": 11591 + }, + { + "epoch": 0.185472, + "grad_norm": 0.73828125, + "learning_rate": 8.211129032258065e-05, + "loss": 0.1883, + "step": 11592 + }, + { + "epoch": 0.185488, + "grad_norm": 0.7109375, + "learning_rate": 8.210967741935485e-05, + "loss": 0.1741, + "step": 11593 + }, + { + "epoch": 0.185504, + "grad_norm": 0.58203125, + "learning_rate": 8.210806451612903e-05, + "loss": 0.1868, + "step": 11594 + }, + { + "epoch": 0.18552, + "grad_norm": 0.8125, + "learning_rate": 8.210645161290323e-05, + "loss": 0.2097, + "step": 11595 + }, + { + "epoch": 0.185536, + "grad_norm": 0.9140625, + "learning_rate": 8.210483870967742e-05, + "loss": 0.2115, + "step": 11596 + }, + { + "epoch": 0.185552, + "grad_norm": 0.828125, + "learning_rate": 8.210322580645162e-05, + "loss": 0.1712, + "step": 11597 + }, + { + "epoch": 0.185568, + "grad_norm": 0.71484375, + "learning_rate": 8.210161290322582e-05, + "loss": 0.1616, + "step": 11598 + }, + { + "epoch": 0.185584, + "grad_norm": 0.83984375, + "learning_rate": 8.21e-05, + "loss": 0.1548, + "step": 11599 + }, + { + "epoch": 0.1856, + "grad_norm": 0.82421875, + "learning_rate": 8.20983870967742e-05, + "loss": 0.1743, + "step": 11600 + }, + { + "epoch": 0.185616, + "grad_norm": 0.6171875, + "learning_rate": 8.209677419354839e-05, + "loss": 0.1529, + "step": 11601 + }, + { + "epoch": 0.185632, + "grad_norm": 1.3203125, + "learning_rate": 8.209516129032259e-05, + "loss": 0.2104, + "step": 11602 + }, + { + "epoch": 0.185648, + "grad_norm": 0.65234375, + "learning_rate": 8.209354838709677e-05, + "loss": 0.1844, + "step": 11603 + }, + { + "epoch": 0.185664, + "grad_norm": 1.0234375, + "learning_rate": 8.209193548387097e-05, + "loss": 0.1625, + "step": 11604 + }, + { + "epoch": 0.18568, + "grad_norm": 0.77734375, + "learning_rate": 8.209032258064516e-05, + "loss": 0.1856, + "step": 11605 + }, + { + "epoch": 0.185696, + "grad_norm": 0.8046875, + "learning_rate": 8.208870967741936e-05, + "loss": 0.1421, + "step": 11606 + }, + { + "epoch": 0.185712, + "grad_norm": 0.703125, + "learning_rate": 8.208709677419355e-05, + "loss": 0.1959, + "step": 11607 + }, + { + "epoch": 0.185728, + "grad_norm": 1.0546875, + "learning_rate": 8.208548387096775e-05, + "loss": 0.2039, + "step": 11608 + }, + { + "epoch": 0.185744, + "grad_norm": 0.7734375, + "learning_rate": 8.208387096774193e-05, + "loss": 0.1697, + "step": 11609 + }, + { + "epoch": 0.18576, + "grad_norm": 0.68359375, + "learning_rate": 8.208225806451613e-05, + "loss": 0.1682, + "step": 11610 + }, + { + "epoch": 0.185776, + "grad_norm": 1.0078125, + "learning_rate": 8.208064516129033e-05, + "loss": 0.2089, + "step": 11611 + }, + { + "epoch": 0.185792, + "grad_norm": 0.78515625, + "learning_rate": 8.207903225806453e-05, + "loss": 0.2176, + "step": 11612 + }, + { + "epoch": 0.185808, + "grad_norm": 0.73046875, + "learning_rate": 8.207741935483872e-05, + "loss": 0.1667, + "step": 11613 + }, + { + "epoch": 0.185824, + "grad_norm": 0.88671875, + "learning_rate": 8.20758064516129e-05, + "loss": 0.1936, + "step": 11614 + }, + { + "epoch": 0.18584, + "grad_norm": 0.5546875, + "learning_rate": 8.20741935483871e-05, + "loss": 0.143, + "step": 11615 + }, + { + "epoch": 0.185856, + "grad_norm": 1.0234375, + "learning_rate": 8.207258064516129e-05, + "loss": 0.1762, + "step": 11616 + }, + { + "epoch": 0.185872, + "grad_norm": 1.234375, + "learning_rate": 8.207096774193549e-05, + "loss": 0.2172, + "step": 11617 + }, + { + "epoch": 0.185888, + "grad_norm": 1.4296875, + "learning_rate": 8.206935483870967e-05, + "loss": 0.1581, + "step": 11618 + }, + { + "epoch": 0.185904, + "grad_norm": 0.9609375, + "learning_rate": 8.206774193548387e-05, + "loss": 0.1741, + "step": 11619 + }, + { + "epoch": 0.18592, + "grad_norm": 0.75390625, + "learning_rate": 8.206612903225806e-05, + "loss": 0.1567, + "step": 11620 + }, + { + "epoch": 0.185936, + "grad_norm": 0.68359375, + "learning_rate": 8.206451612903226e-05, + "loss": 0.1237, + "step": 11621 + }, + { + "epoch": 0.185952, + "grad_norm": 0.65234375, + "learning_rate": 8.206290322580646e-05, + "loss": 0.165, + "step": 11622 + }, + { + "epoch": 0.185968, + "grad_norm": 0.77734375, + "learning_rate": 8.206129032258066e-05, + "loss": 0.1642, + "step": 11623 + }, + { + "epoch": 0.185984, + "grad_norm": 0.82421875, + "learning_rate": 8.205967741935485e-05, + "loss": 0.1941, + "step": 11624 + }, + { + "epoch": 0.186, + "grad_norm": 0.640625, + "learning_rate": 8.205806451612904e-05, + "loss": 0.1424, + "step": 11625 + }, + { + "epoch": 0.186016, + "grad_norm": 0.67578125, + "learning_rate": 8.205645161290323e-05, + "loss": 0.1857, + "step": 11626 + }, + { + "epoch": 0.186032, + "grad_norm": 0.76171875, + "learning_rate": 8.205483870967743e-05, + "loss": 0.2087, + "step": 11627 + }, + { + "epoch": 0.186048, + "grad_norm": 0.77734375, + "learning_rate": 8.205322580645162e-05, + "loss": 0.1649, + "step": 11628 + }, + { + "epoch": 0.186064, + "grad_norm": 0.85546875, + "learning_rate": 8.20516129032258e-05, + "loss": 0.1612, + "step": 11629 + }, + { + "epoch": 0.18608, + "grad_norm": 0.8984375, + "learning_rate": 8.205e-05, + "loss": 0.21, + "step": 11630 + }, + { + "epoch": 0.186096, + "grad_norm": 0.921875, + "learning_rate": 8.204838709677419e-05, + "loss": 0.1546, + "step": 11631 + }, + { + "epoch": 0.186112, + "grad_norm": 1.0546875, + "learning_rate": 8.204677419354839e-05, + "loss": 0.1803, + "step": 11632 + }, + { + "epoch": 0.186128, + "grad_norm": 0.8671875, + "learning_rate": 8.204516129032259e-05, + "loss": 0.1736, + "step": 11633 + }, + { + "epoch": 0.186144, + "grad_norm": 1.0234375, + "learning_rate": 8.204354838709677e-05, + "loss": 0.1804, + "step": 11634 + }, + { + "epoch": 0.18616, + "grad_norm": 0.7109375, + "learning_rate": 8.204193548387097e-05, + "loss": 0.1376, + "step": 11635 + }, + { + "epoch": 0.186176, + "grad_norm": 0.58984375, + "learning_rate": 8.204032258064517e-05, + "loss": 0.1532, + "step": 11636 + }, + { + "epoch": 0.186192, + "grad_norm": 1.40625, + "learning_rate": 8.203870967741936e-05, + "loss": 0.2074, + "step": 11637 + }, + { + "epoch": 0.186208, + "grad_norm": 0.96875, + "learning_rate": 8.203709677419356e-05, + "loss": 0.1985, + "step": 11638 + }, + { + "epoch": 0.186224, + "grad_norm": 0.76953125, + "learning_rate": 8.203548387096774e-05, + "loss": 0.1481, + "step": 11639 + }, + { + "epoch": 0.18624, + "grad_norm": 0.76171875, + "learning_rate": 8.203387096774194e-05, + "loss": 0.1739, + "step": 11640 + }, + { + "epoch": 0.186256, + "grad_norm": 1.0, + "learning_rate": 8.203225806451613e-05, + "loss": 0.1629, + "step": 11641 + }, + { + "epoch": 0.186272, + "grad_norm": 0.99609375, + "learning_rate": 8.203064516129033e-05, + "loss": 0.1993, + "step": 11642 + }, + { + "epoch": 0.186288, + "grad_norm": 1.34375, + "learning_rate": 8.202903225806452e-05, + "loss": 0.2134, + "step": 11643 + }, + { + "epoch": 0.186304, + "grad_norm": 0.75, + "learning_rate": 8.20274193548387e-05, + "loss": 0.1293, + "step": 11644 + }, + { + "epoch": 0.18632, + "grad_norm": 0.61328125, + "learning_rate": 8.20258064516129e-05, + "loss": 0.1387, + "step": 11645 + }, + { + "epoch": 0.186336, + "grad_norm": 0.73046875, + "learning_rate": 8.20241935483871e-05, + "loss": 0.2035, + "step": 11646 + }, + { + "epoch": 0.186352, + "grad_norm": 0.7265625, + "learning_rate": 8.20225806451613e-05, + "loss": 0.1545, + "step": 11647 + }, + { + "epoch": 0.186368, + "grad_norm": 0.625, + "learning_rate": 8.202096774193549e-05, + "loss": 0.1483, + "step": 11648 + }, + { + "epoch": 0.186384, + "grad_norm": 1.0234375, + "learning_rate": 8.201935483870969e-05, + "loss": 0.18, + "step": 11649 + }, + { + "epoch": 0.1864, + "grad_norm": 0.73828125, + "learning_rate": 8.201774193548387e-05, + "loss": 0.157, + "step": 11650 + }, + { + "epoch": 0.186416, + "grad_norm": 0.80859375, + "learning_rate": 8.201612903225807e-05, + "loss": 0.1573, + "step": 11651 + }, + { + "epoch": 0.186432, + "grad_norm": 0.76171875, + "learning_rate": 8.201451612903226e-05, + "loss": 0.1672, + "step": 11652 + }, + { + "epoch": 0.186448, + "grad_norm": 0.703125, + "learning_rate": 8.201290322580646e-05, + "loss": 0.1457, + "step": 11653 + }, + { + "epoch": 0.186464, + "grad_norm": 0.65625, + "learning_rate": 8.201129032258064e-05, + "loss": 0.1229, + "step": 11654 + }, + { + "epoch": 0.18648, + "grad_norm": 0.71484375, + "learning_rate": 8.200967741935484e-05, + "loss": 0.1674, + "step": 11655 + }, + { + "epoch": 0.186496, + "grad_norm": 0.94921875, + "learning_rate": 8.200806451612903e-05, + "loss": 0.1675, + "step": 11656 + }, + { + "epoch": 0.186512, + "grad_norm": 0.74609375, + "learning_rate": 8.200645161290323e-05, + "loss": 0.1831, + "step": 11657 + }, + { + "epoch": 0.186528, + "grad_norm": 0.69140625, + "learning_rate": 8.200483870967743e-05, + "loss": 0.1677, + "step": 11658 + }, + { + "epoch": 0.186544, + "grad_norm": 0.86328125, + "learning_rate": 8.200322580645163e-05, + "loss": 0.1918, + "step": 11659 + }, + { + "epoch": 0.18656, + "grad_norm": 1.1328125, + "learning_rate": 8.200161290322581e-05, + "loss": 0.1648, + "step": 11660 + }, + { + "epoch": 0.186576, + "grad_norm": 0.796875, + "learning_rate": 8.2e-05, + "loss": 0.1634, + "step": 11661 + }, + { + "epoch": 0.186592, + "grad_norm": 0.79296875, + "learning_rate": 8.19983870967742e-05, + "loss": 0.1748, + "step": 11662 + }, + { + "epoch": 0.186608, + "grad_norm": 0.79296875, + "learning_rate": 8.199677419354839e-05, + "loss": 0.1899, + "step": 11663 + }, + { + "epoch": 0.186624, + "grad_norm": 0.87109375, + "learning_rate": 8.199516129032259e-05, + "loss": 0.136, + "step": 11664 + }, + { + "epoch": 0.18664, + "grad_norm": 0.6640625, + "learning_rate": 8.199354838709677e-05, + "loss": 0.1825, + "step": 11665 + }, + { + "epoch": 0.186656, + "grad_norm": 0.79296875, + "learning_rate": 8.199193548387097e-05, + "loss": 0.1611, + "step": 11666 + }, + { + "epoch": 0.186672, + "grad_norm": 0.56640625, + "learning_rate": 8.199032258064516e-05, + "loss": 0.1688, + "step": 11667 + }, + { + "epoch": 0.186688, + "grad_norm": 0.890625, + "learning_rate": 8.198870967741936e-05, + "loss": 0.183, + "step": 11668 + }, + { + "epoch": 0.186704, + "grad_norm": 0.65625, + "learning_rate": 8.198709677419354e-05, + "loss": 0.1768, + "step": 11669 + }, + { + "epoch": 0.18672, + "grad_norm": 0.9296875, + "learning_rate": 8.198548387096774e-05, + "loss": 0.2067, + "step": 11670 + }, + { + "epoch": 0.186736, + "grad_norm": 0.73046875, + "learning_rate": 8.198387096774194e-05, + "loss": 0.1728, + "step": 11671 + }, + { + "epoch": 0.186752, + "grad_norm": 0.65234375, + "learning_rate": 8.198225806451614e-05, + "loss": 0.1603, + "step": 11672 + }, + { + "epoch": 0.186768, + "grad_norm": 0.6953125, + "learning_rate": 8.198064516129033e-05, + "loss": 0.1483, + "step": 11673 + }, + { + "epoch": 0.186784, + "grad_norm": 0.97265625, + "learning_rate": 8.197903225806453e-05, + "loss": 0.2117, + "step": 11674 + }, + { + "epoch": 0.1868, + "grad_norm": 0.77734375, + "learning_rate": 8.197741935483871e-05, + "loss": 0.1893, + "step": 11675 + }, + { + "epoch": 0.186816, + "grad_norm": 0.84375, + "learning_rate": 8.19758064516129e-05, + "loss": 0.2035, + "step": 11676 + }, + { + "epoch": 0.186832, + "grad_norm": 1.015625, + "learning_rate": 8.19741935483871e-05, + "loss": 0.1636, + "step": 11677 + }, + { + "epoch": 0.186848, + "grad_norm": 1.2890625, + "learning_rate": 8.197258064516129e-05, + "loss": 0.1791, + "step": 11678 + }, + { + "epoch": 0.186864, + "grad_norm": 1.078125, + "learning_rate": 8.197096774193549e-05, + "loss": 0.1845, + "step": 11679 + }, + { + "epoch": 0.18688, + "grad_norm": 0.5625, + "learning_rate": 8.196935483870967e-05, + "loss": 0.1476, + "step": 11680 + }, + { + "epoch": 0.186896, + "grad_norm": 0.6953125, + "learning_rate": 8.196774193548387e-05, + "loss": 0.1708, + "step": 11681 + }, + { + "epoch": 0.186912, + "grad_norm": 0.984375, + "learning_rate": 8.196612903225807e-05, + "loss": 0.1857, + "step": 11682 + }, + { + "epoch": 0.186928, + "grad_norm": 0.55078125, + "learning_rate": 8.196451612903227e-05, + "loss": 0.1419, + "step": 11683 + }, + { + "epoch": 0.186944, + "grad_norm": 0.546875, + "learning_rate": 8.196290322580646e-05, + "loss": 0.1359, + "step": 11684 + }, + { + "epoch": 0.18696, + "grad_norm": 0.6953125, + "learning_rate": 8.196129032258066e-05, + "loss": 0.1957, + "step": 11685 + }, + { + "epoch": 0.186976, + "grad_norm": 0.76171875, + "learning_rate": 8.195967741935484e-05, + "loss": 0.1399, + "step": 11686 + }, + { + "epoch": 0.186992, + "grad_norm": 0.83984375, + "learning_rate": 8.195806451612904e-05, + "loss": 0.1828, + "step": 11687 + }, + { + "epoch": 0.187008, + "grad_norm": 1.046875, + "learning_rate": 8.195645161290323e-05, + "loss": 0.1813, + "step": 11688 + }, + { + "epoch": 0.187024, + "grad_norm": 0.640625, + "learning_rate": 8.195483870967743e-05, + "loss": 0.1547, + "step": 11689 + }, + { + "epoch": 0.18704, + "grad_norm": 0.80859375, + "learning_rate": 8.195322580645161e-05, + "loss": 0.1792, + "step": 11690 + }, + { + "epoch": 0.187056, + "grad_norm": 0.69921875, + "learning_rate": 8.19516129032258e-05, + "loss": 0.2117, + "step": 11691 + }, + { + "epoch": 0.187072, + "grad_norm": 0.6484375, + "learning_rate": 8.195e-05, + "loss": 0.1753, + "step": 11692 + }, + { + "epoch": 0.187088, + "grad_norm": 1.0390625, + "learning_rate": 8.19483870967742e-05, + "loss": 0.1603, + "step": 11693 + }, + { + "epoch": 0.187104, + "grad_norm": 0.63671875, + "learning_rate": 8.19467741935484e-05, + "loss": 0.1603, + "step": 11694 + }, + { + "epoch": 0.18712, + "grad_norm": 0.59765625, + "learning_rate": 8.194516129032259e-05, + "loss": 0.1848, + "step": 11695 + }, + { + "epoch": 0.187136, + "grad_norm": 0.71875, + "learning_rate": 8.194354838709678e-05, + "loss": 0.1654, + "step": 11696 + }, + { + "epoch": 0.187152, + "grad_norm": 0.8203125, + "learning_rate": 8.194193548387097e-05, + "loss": 0.2095, + "step": 11697 + }, + { + "epoch": 0.187168, + "grad_norm": 0.890625, + "learning_rate": 8.194032258064517e-05, + "loss": 0.1998, + "step": 11698 + }, + { + "epoch": 0.187184, + "grad_norm": 1.15625, + "learning_rate": 8.193870967741936e-05, + "loss": 0.231, + "step": 11699 + }, + { + "epoch": 0.1872, + "grad_norm": 0.75, + "learning_rate": 8.193709677419356e-05, + "loss": 0.1752, + "step": 11700 + }, + { + "epoch": 0.187216, + "grad_norm": 0.6953125, + "learning_rate": 8.193548387096774e-05, + "loss": 0.1803, + "step": 11701 + }, + { + "epoch": 0.187232, + "grad_norm": 1.359375, + "learning_rate": 8.193387096774194e-05, + "loss": 0.2039, + "step": 11702 + }, + { + "epoch": 0.187248, + "grad_norm": 0.73828125, + "learning_rate": 8.193225806451613e-05, + "loss": 0.1623, + "step": 11703 + }, + { + "epoch": 0.187264, + "grad_norm": 1.453125, + "learning_rate": 8.193064516129033e-05, + "loss": 0.1541, + "step": 11704 + }, + { + "epoch": 0.18728, + "grad_norm": 1.359375, + "learning_rate": 8.192903225806451e-05, + "loss": 0.1754, + "step": 11705 + }, + { + "epoch": 0.187296, + "grad_norm": 0.97265625, + "learning_rate": 8.192741935483871e-05, + "loss": 0.1374, + "step": 11706 + }, + { + "epoch": 0.187312, + "grad_norm": 0.8984375, + "learning_rate": 8.192580645161291e-05, + "loss": 0.1472, + "step": 11707 + }, + { + "epoch": 0.187328, + "grad_norm": 0.8203125, + "learning_rate": 8.19241935483871e-05, + "loss": 0.1781, + "step": 11708 + }, + { + "epoch": 0.187344, + "grad_norm": 1.3046875, + "learning_rate": 8.19225806451613e-05, + "loss": 0.195, + "step": 11709 + }, + { + "epoch": 0.18736, + "grad_norm": 0.8828125, + "learning_rate": 8.192096774193548e-05, + "loss": 0.179, + "step": 11710 + }, + { + "epoch": 0.187376, + "grad_norm": 0.85546875, + "learning_rate": 8.191935483870968e-05, + "loss": 0.1549, + "step": 11711 + }, + { + "epoch": 0.187392, + "grad_norm": 0.65625, + "learning_rate": 8.191774193548387e-05, + "loss": 0.194, + "step": 11712 + }, + { + "epoch": 0.187408, + "grad_norm": 0.8203125, + "learning_rate": 8.191612903225807e-05, + "loss": 0.1853, + "step": 11713 + }, + { + "epoch": 0.187424, + "grad_norm": 0.83984375, + "learning_rate": 8.191451612903226e-05, + "loss": 0.2266, + "step": 11714 + }, + { + "epoch": 0.18744, + "grad_norm": 0.79296875, + "learning_rate": 8.191290322580646e-05, + "loss": 0.1536, + "step": 11715 + }, + { + "epoch": 0.187456, + "grad_norm": 0.78125, + "learning_rate": 8.191129032258064e-05, + "loss": 0.1878, + "step": 11716 + }, + { + "epoch": 0.187472, + "grad_norm": 0.55859375, + "learning_rate": 8.190967741935484e-05, + "loss": 0.1377, + "step": 11717 + }, + { + "epoch": 0.187488, + "grad_norm": 0.80078125, + "learning_rate": 8.190806451612904e-05, + "loss": 0.2064, + "step": 11718 + }, + { + "epoch": 0.187504, + "grad_norm": 0.93359375, + "learning_rate": 8.190645161290324e-05, + "loss": 0.1775, + "step": 11719 + }, + { + "epoch": 0.18752, + "grad_norm": 0.7578125, + "learning_rate": 8.190483870967743e-05, + "loss": 0.1258, + "step": 11720 + }, + { + "epoch": 0.187536, + "grad_norm": 0.7109375, + "learning_rate": 8.190322580645163e-05, + "loss": 0.1395, + "step": 11721 + }, + { + "epoch": 0.187552, + "grad_norm": 0.7421875, + "learning_rate": 8.190161290322581e-05, + "loss": 0.186, + "step": 11722 + }, + { + "epoch": 0.187568, + "grad_norm": 1.015625, + "learning_rate": 8.19e-05, + "loss": 0.1475, + "step": 11723 + }, + { + "epoch": 0.187584, + "grad_norm": 0.67578125, + "learning_rate": 8.18983870967742e-05, + "loss": 0.1525, + "step": 11724 + }, + { + "epoch": 0.1876, + "grad_norm": 0.72265625, + "learning_rate": 8.189677419354838e-05, + "loss": 0.1505, + "step": 11725 + }, + { + "epoch": 0.187616, + "grad_norm": 0.7890625, + "learning_rate": 8.189516129032258e-05, + "loss": 0.1794, + "step": 11726 + }, + { + "epoch": 0.187632, + "grad_norm": 0.58203125, + "learning_rate": 8.189354838709677e-05, + "loss": 0.1213, + "step": 11727 + }, + { + "epoch": 0.187648, + "grad_norm": 0.9296875, + "learning_rate": 8.189193548387097e-05, + "loss": 0.1905, + "step": 11728 + }, + { + "epoch": 0.187664, + "grad_norm": 0.78515625, + "learning_rate": 8.189032258064516e-05, + "loss": 0.181, + "step": 11729 + }, + { + "epoch": 0.18768, + "grad_norm": 0.80859375, + "learning_rate": 8.188870967741936e-05, + "loss": 0.1645, + "step": 11730 + }, + { + "epoch": 0.187696, + "grad_norm": 0.59765625, + "learning_rate": 8.188709677419355e-05, + "loss": 0.1501, + "step": 11731 + }, + { + "epoch": 0.187712, + "grad_norm": 1.25, + "learning_rate": 8.188548387096775e-05, + "loss": 0.187, + "step": 11732 + }, + { + "epoch": 0.187728, + "grad_norm": 1.2109375, + "learning_rate": 8.188387096774194e-05, + "loss": 0.2004, + "step": 11733 + }, + { + "epoch": 0.187744, + "grad_norm": 0.59765625, + "learning_rate": 8.188225806451614e-05, + "loss": 0.1543, + "step": 11734 + }, + { + "epoch": 0.18776, + "grad_norm": 0.54296875, + "learning_rate": 8.188064516129033e-05, + "loss": 0.1682, + "step": 11735 + }, + { + "epoch": 0.187776, + "grad_norm": 1.21875, + "learning_rate": 8.187903225806453e-05, + "loss": 0.1689, + "step": 11736 + }, + { + "epoch": 0.187792, + "grad_norm": 0.578125, + "learning_rate": 8.187741935483871e-05, + "loss": 0.1619, + "step": 11737 + }, + { + "epoch": 0.187808, + "grad_norm": 0.62109375, + "learning_rate": 8.18758064516129e-05, + "loss": 0.178, + "step": 11738 + }, + { + "epoch": 0.187824, + "grad_norm": 0.7578125, + "learning_rate": 8.18741935483871e-05, + "loss": 0.1647, + "step": 11739 + }, + { + "epoch": 0.18784, + "grad_norm": 0.890625, + "learning_rate": 8.187258064516128e-05, + "loss": 0.193, + "step": 11740 + }, + { + "epoch": 0.187856, + "grad_norm": 0.703125, + "learning_rate": 8.187096774193548e-05, + "loss": 0.1792, + "step": 11741 + }, + { + "epoch": 0.187872, + "grad_norm": 0.73046875, + "learning_rate": 8.186935483870968e-05, + "loss": 0.1995, + "step": 11742 + }, + { + "epoch": 0.187888, + "grad_norm": 1.03125, + "learning_rate": 8.186774193548388e-05, + "loss": 0.1553, + "step": 11743 + }, + { + "epoch": 0.187904, + "grad_norm": 0.796875, + "learning_rate": 8.186612903225807e-05, + "loss": 0.1789, + "step": 11744 + }, + { + "epoch": 0.18792, + "grad_norm": 0.7421875, + "learning_rate": 8.186451612903227e-05, + "loss": 0.1995, + "step": 11745 + }, + { + "epoch": 0.187936, + "grad_norm": 1.375, + "learning_rate": 8.186290322580645e-05, + "loss": 0.2142, + "step": 11746 + }, + { + "epoch": 0.187952, + "grad_norm": 0.7265625, + "learning_rate": 8.186129032258065e-05, + "loss": 0.1963, + "step": 11747 + }, + { + "epoch": 0.187968, + "grad_norm": 0.91015625, + "learning_rate": 8.185967741935484e-05, + "loss": 0.1773, + "step": 11748 + }, + { + "epoch": 0.187984, + "grad_norm": 0.61328125, + "learning_rate": 8.185806451612904e-05, + "loss": 0.1496, + "step": 11749 + }, + { + "epoch": 0.188, + "grad_norm": 0.67578125, + "learning_rate": 8.185645161290323e-05, + "loss": 0.1558, + "step": 11750 + }, + { + "epoch": 0.188016, + "grad_norm": 1.0625, + "learning_rate": 8.185483870967743e-05, + "loss": 0.2001, + "step": 11751 + }, + { + "epoch": 0.188032, + "grad_norm": 1.25, + "learning_rate": 8.185322580645161e-05, + "loss": 0.2103, + "step": 11752 + }, + { + "epoch": 0.188048, + "grad_norm": 1.015625, + "learning_rate": 8.185161290322581e-05, + "loss": 0.2269, + "step": 11753 + }, + { + "epoch": 0.188064, + "grad_norm": 0.78125, + "learning_rate": 8.185000000000001e-05, + "loss": 0.1663, + "step": 11754 + }, + { + "epoch": 0.18808, + "grad_norm": 0.78515625, + "learning_rate": 8.18483870967742e-05, + "loss": 0.1757, + "step": 11755 + }, + { + "epoch": 0.188096, + "grad_norm": 1.046875, + "learning_rate": 8.18467741935484e-05, + "loss": 0.1994, + "step": 11756 + }, + { + "epoch": 0.188112, + "grad_norm": 0.81640625, + "learning_rate": 8.184516129032258e-05, + "loss": 0.2263, + "step": 11757 + }, + { + "epoch": 0.188128, + "grad_norm": 0.67578125, + "learning_rate": 8.184354838709678e-05, + "loss": 0.1757, + "step": 11758 + }, + { + "epoch": 0.188144, + "grad_norm": 0.7578125, + "learning_rate": 8.184193548387097e-05, + "loss": 0.1693, + "step": 11759 + }, + { + "epoch": 0.18816, + "grad_norm": 0.74609375, + "learning_rate": 8.184032258064517e-05, + "loss": 0.2191, + "step": 11760 + }, + { + "epoch": 0.188176, + "grad_norm": 0.91796875, + "learning_rate": 8.183870967741935e-05, + "loss": 0.1643, + "step": 11761 + }, + { + "epoch": 0.188192, + "grad_norm": 1.0625, + "learning_rate": 8.183709677419355e-05, + "loss": 0.1999, + "step": 11762 + }, + { + "epoch": 0.188208, + "grad_norm": 1.328125, + "learning_rate": 8.183548387096774e-05, + "loss": 0.1445, + "step": 11763 + }, + { + "epoch": 0.188224, + "grad_norm": 0.83984375, + "learning_rate": 8.183387096774194e-05, + "loss": 0.1916, + "step": 11764 + }, + { + "epoch": 0.18824, + "grad_norm": 0.5390625, + "learning_rate": 8.183225806451613e-05, + "loss": 0.138, + "step": 11765 + }, + { + "epoch": 0.188256, + "grad_norm": 1.3828125, + "learning_rate": 8.183064516129033e-05, + "loss": 0.1942, + "step": 11766 + }, + { + "epoch": 0.188272, + "grad_norm": 1.109375, + "learning_rate": 8.182903225806452e-05, + "loss": 0.1834, + "step": 11767 + }, + { + "epoch": 0.188288, + "grad_norm": 0.6953125, + "learning_rate": 8.182741935483872e-05, + "loss": 0.1724, + "step": 11768 + }, + { + "epoch": 0.188304, + "grad_norm": 0.66796875, + "learning_rate": 8.182580645161291e-05, + "loss": 0.1566, + "step": 11769 + }, + { + "epoch": 0.18832, + "grad_norm": 0.98046875, + "learning_rate": 8.18241935483871e-05, + "loss": 0.1905, + "step": 11770 + }, + { + "epoch": 0.188336, + "grad_norm": 0.66796875, + "learning_rate": 8.18225806451613e-05, + "loss": 0.1889, + "step": 11771 + }, + { + "epoch": 0.188352, + "grad_norm": 0.65234375, + "learning_rate": 8.182096774193548e-05, + "loss": 0.1641, + "step": 11772 + }, + { + "epoch": 0.188368, + "grad_norm": 0.98046875, + "learning_rate": 8.181935483870968e-05, + "loss": 0.1671, + "step": 11773 + }, + { + "epoch": 0.188384, + "grad_norm": 0.68359375, + "learning_rate": 8.181774193548387e-05, + "loss": 0.1761, + "step": 11774 + }, + { + "epoch": 0.1884, + "grad_norm": 1.1484375, + "learning_rate": 8.181612903225807e-05, + "loss": 0.1948, + "step": 11775 + }, + { + "epoch": 0.188416, + "grad_norm": 0.63671875, + "learning_rate": 8.181451612903225e-05, + "loss": 0.1615, + "step": 11776 + }, + { + "epoch": 0.188432, + "grad_norm": 0.609375, + "learning_rate": 8.181290322580645e-05, + "loss": 0.1519, + "step": 11777 + }, + { + "epoch": 0.188448, + "grad_norm": 0.71875, + "learning_rate": 8.181129032258065e-05, + "loss": 0.166, + "step": 11778 + }, + { + "epoch": 0.188464, + "grad_norm": 1.1171875, + "learning_rate": 8.180967741935485e-05, + "loss": 0.1458, + "step": 11779 + }, + { + "epoch": 0.18848, + "grad_norm": 1.1328125, + "learning_rate": 8.180806451612904e-05, + "loss": 0.1725, + "step": 11780 + }, + { + "epoch": 0.188496, + "grad_norm": 0.6484375, + "learning_rate": 8.180645161290324e-05, + "loss": 0.1849, + "step": 11781 + }, + { + "epoch": 0.188512, + "grad_norm": 0.87109375, + "learning_rate": 8.180483870967742e-05, + "loss": 0.1583, + "step": 11782 + }, + { + "epoch": 0.188528, + "grad_norm": 0.74609375, + "learning_rate": 8.180322580645162e-05, + "loss": 0.1685, + "step": 11783 + }, + { + "epoch": 0.188544, + "grad_norm": 0.78515625, + "learning_rate": 8.180161290322581e-05, + "loss": 0.1662, + "step": 11784 + }, + { + "epoch": 0.18856, + "grad_norm": 0.65625, + "learning_rate": 8.18e-05, + "loss": 0.1331, + "step": 11785 + }, + { + "epoch": 0.188576, + "grad_norm": 0.5546875, + "learning_rate": 8.17983870967742e-05, + "loss": 0.1492, + "step": 11786 + }, + { + "epoch": 0.188592, + "grad_norm": 0.76171875, + "learning_rate": 8.179677419354838e-05, + "loss": 0.1427, + "step": 11787 + }, + { + "epoch": 0.188608, + "grad_norm": 0.6484375, + "learning_rate": 8.179516129032258e-05, + "loss": 0.1588, + "step": 11788 + }, + { + "epoch": 0.188624, + "grad_norm": 1.5078125, + "learning_rate": 8.179354838709678e-05, + "loss": 0.1787, + "step": 11789 + }, + { + "epoch": 0.18864, + "grad_norm": 0.75, + "learning_rate": 8.179193548387098e-05, + "loss": 0.1879, + "step": 11790 + }, + { + "epoch": 0.188656, + "grad_norm": 1.6953125, + "learning_rate": 8.179032258064517e-05, + "loss": 0.1767, + "step": 11791 + }, + { + "epoch": 0.188672, + "grad_norm": 1.328125, + "learning_rate": 8.178870967741937e-05, + "loss": 0.1727, + "step": 11792 + }, + { + "epoch": 0.188688, + "grad_norm": 1.2109375, + "learning_rate": 8.178709677419355e-05, + "loss": 0.1867, + "step": 11793 + }, + { + "epoch": 0.188704, + "grad_norm": 1.0078125, + "learning_rate": 8.178548387096775e-05, + "loss": 0.1711, + "step": 11794 + }, + { + "epoch": 0.18872, + "grad_norm": 0.546875, + "learning_rate": 8.178387096774194e-05, + "loss": 0.1545, + "step": 11795 + }, + { + "epoch": 0.188736, + "grad_norm": 0.74609375, + "learning_rate": 8.178225806451614e-05, + "loss": 0.1861, + "step": 11796 + }, + { + "epoch": 0.188752, + "grad_norm": 0.7421875, + "learning_rate": 8.178064516129032e-05, + "loss": 0.1624, + "step": 11797 + }, + { + "epoch": 0.188768, + "grad_norm": 0.87109375, + "learning_rate": 8.177903225806452e-05, + "loss": 0.143, + "step": 11798 + }, + { + "epoch": 0.188784, + "grad_norm": 0.91796875, + "learning_rate": 8.177741935483871e-05, + "loss": 0.185, + "step": 11799 + }, + { + "epoch": 0.1888, + "grad_norm": 0.87109375, + "learning_rate": 8.17758064516129e-05, + "loss": 0.1613, + "step": 11800 + }, + { + "epoch": 0.188816, + "grad_norm": 0.9453125, + "learning_rate": 8.17741935483871e-05, + "loss": 0.2045, + "step": 11801 + }, + { + "epoch": 0.188832, + "grad_norm": 0.56640625, + "learning_rate": 8.17725806451613e-05, + "loss": 0.1763, + "step": 11802 + }, + { + "epoch": 0.188848, + "grad_norm": 0.5703125, + "learning_rate": 8.17709677419355e-05, + "loss": 0.1717, + "step": 11803 + }, + { + "epoch": 0.188864, + "grad_norm": 0.7734375, + "learning_rate": 8.176935483870968e-05, + "loss": 0.1842, + "step": 11804 + }, + { + "epoch": 0.18888, + "grad_norm": 0.7421875, + "learning_rate": 8.176774193548388e-05, + "loss": 0.1532, + "step": 11805 + }, + { + "epoch": 0.188896, + "grad_norm": 0.6015625, + "learning_rate": 8.176612903225807e-05, + "loss": 0.1883, + "step": 11806 + }, + { + "epoch": 0.188912, + "grad_norm": 0.68359375, + "learning_rate": 8.176451612903227e-05, + "loss": 0.1786, + "step": 11807 + }, + { + "epoch": 0.188928, + "grad_norm": 1.0390625, + "learning_rate": 8.176290322580645e-05, + "loss": 0.1761, + "step": 11808 + }, + { + "epoch": 0.188944, + "grad_norm": 0.85546875, + "learning_rate": 8.176129032258065e-05, + "loss": 0.212, + "step": 11809 + }, + { + "epoch": 0.18896, + "grad_norm": 0.98046875, + "learning_rate": 8.175967741935484e-05, + "loss": 0.1688, + "step": 11810 + }, + { + "epoch": 0.188976, + "grad_norm": 1.0, + "learning_rate": 8.175806451612904e-05, + "loss": 0.1846, + "step": 11811 + }, + { + "epoch": 0.188992, + "grad_norm": 0.62109375, + "learning_rate": 8.175645161290322e-05, + "loss": 0.1667, + "step": 11812 + }, + { + "epoch": 0.189008, + "grad_norm": 0.69921875, + "learning_rate": 8.175483870967742e-05, + "loss": 0.1782, + "step": 11813 + }, + { + "epoch": 0.189024, + "grad_norm": 0.93359375, + "learning_rate": 8.175322580645162e-05, + "loss": 0.1796, + "step": 11814 + }, + { + "epoch": 0.18904, + "grad_norm": 1.0078125, + "learning_rate": 8.175161290322581e-05, + "loss": 0.1626, + "step": 11815 + }, + { + "epoch": 0.189056, + "grad_norm": 0.796875, + "learning_rate": 8.175000000000001e-05, + "loss": 0.1419, + "step": 11816 + }, + { + "epoch": 0.189072, + "grad_norm": 0.96484375, + "learning_rate": 8.17483870967742e-05, + "loss": 0.2111, + "step": 11817 + }, + { + "epoch": 0.189088, + "grad_norm": 0.5703125, + "learning_rate": 8.17467741935484e-05, + "loss": 0.1743, + "step": 11818 + }, + { + "epoch": 0.189104, + "grad_norm": 1.0078125, + "learning_rate": 8.174516129032258e-05, + "loss": 0.1832, + "step": 11819 + }, + { + "epoch": 0.18912, + "grad_norm": 0.609375, + "learning_rate": 8.174354838709678e-05, + "loss": 0.148, + "step": 11820 + }, + { + "epoch": 0.189136, + "grad_norm": 0.71484375, + "learning_rate": 8.174193548387097e-05, + "loss": 0.1555, + "step": 11821 + }, + { + "epoch": 0.189152, + "grad_norm": 0.578125, + "learning_rate": 8.174032258064517e-05, + "loss": 0.1527, + "step": 11822 + }, + { + "epoch": 0.189168, + "grad_norm": 0.70703125, + "learning_rate": 8.173870967741935e-05, + "loss": 0.1813, + "step": 11823 + }, + { + "epoch": 0.189184, + "grad_norm": 0.8125, + "learning_rate": 8.173709677419355e-05, + "loss": 0.1921, + "step": 11824 + }, + { + "epoch": 0.1892, + "grad_norm": 0.640625, + "learning_rate": 8.173548387096774e-05, + "loss": 0.1319, + "step": 11825 + }, + { + "epoch": 0.189216, + "grad_norm": 0.7578125, + "learning_rate": 8.173387096774194e-05, + "loss": 0.1814, + "step": 11826 + }, + { + "epoch": 0.189232, + "grad_norm": 0.62890625, + "learning_rate": 8.173225806451614e-05, + "loss": 0.1808, + "step": 11827 + }, + { + "epoch": 0.189248, + "grad_norm": 0.734375, + "learning_rate": 8.173064516129034e-05, + "loss": 0.1779, + "step": 11828 + }, + { + "epoch": 0.189264, + "grad_norm": 1.3203125, + "learning_rate": 8.172903225806452e-05, + "loss": 0.1779, + "step": 11829 + }, + { + "epoch": 0.18928, + "grad_norm": 0.78515625, + "learning_rate": 8.172741935483872e-05, + "loss": 0.1516, + "step": 11830 + }, + { + "epoch": 0.189296, + "grad_norm": 1.0, + "learning_rate": 8.172580645161291e-05, + "loss": 0.1968, + "step": 11831 + }, + { + "epoch": 0.189312, + "grad_norm": 0.77734375, + "learning_rate": 8.17241935483871e-05, + "loss": 0.1938, + "step": 11832 + }, + { + "epoch": 0.189328, + "grad_norm": 0.671875, + "learning_rate": 8.17225806451613e-05, + "loss": 0.174, + "step": 11833 + }, + { + "epoch": 0.189344, + "grad_norm": 0.64453125, + "learning_rate": 8.172096774193548e-05, + "loss": 0.1553, + "step": 11834 + }, + { + "epoch": 0.18936, + "grad_norm": 0.8671875, + "learning_rate": 8.171935483870968e-05, + "loss": 0.1979, + "step": 11835 + }, + { + "epoch": 0.189376, + "grad_norm": 0.65234375, + "learning_rate": 8.171774193548387e-05, + "loss": 0.1997, + "step": 11836 + }, + { + "epoch": 0.189392, + "grad_norm": 0.6796875, + "learning_rate": 8.171612903225807e-05, + "loss": 0.1593, + "step": 11837 + }, + { + "epoch": 0.189408, + "grad_norm": 0.859375, + "learning_rate": 8.171451612903226e-05, + "loss": 0.1684, + "step": 11838 + }, + { + "epoch": 0.189424, + "grad_norm": 0.69140625, + "learning_rate": 8.171290322580646e-05, + "loss": 0.2207, + "step": 11839 + }, + { + "epoch": 0.18944, + "grad_norm": 0.71484375, + "learning_rate": 8.171129032258065e-05, + "loss": 0.143, + "step": 11840 + }, + { + "epoch": 0.189456, + "grad_norm": 1.015625, + "learning_rate": 8.170967741935485e-05, + "loss": 0.2107, + "step": 11841 + }, + { + "epoch": 0.189472, + "grad_norm": 0.84765625, + "learning_rate": 8.170806451612904e-05, + "loss": 0.1406, + "step": 11842 + }, + { + "epoch": 0.189488, + "grad_norm": 0.71875, + "learning_rate": 8.170645161290324e-05, + "loss": 0.1632, + "step": 11843 + }, + { + "epoch": 0.189504, + "grad_norm": 0.69921875, + "learning_rate": 8.170483870967742e-05, + "loss": 0.194, + "step": 11844 + }, + { + "epoch": 0.18952, + "grad_norm": 0.71875, + "learning_rate": 8.170322580645162e-05, + "loss": 0.1515, + "step": 11845 + }, + { + "epoch": 0.189536, + "grad_norm": 0.703125, + "learning_rate": 8.170161290322581e-05, + "loss": 0.1596, + "step": 11846 + }, + { + "epoch": 0.189552, + "grad_norm": 0.53515625, + "learning_rate": 8.17e-05, + "loss": 0.1616, + "step": 11847 + }, + { + "epoch": 0.189568, + "grad_norm": 0.68359375, + "learning_rate": 8.16983870967742e-05, + "loss": 0.1903, + "step": 11848 + }, + { + "epoch": 0.189584, + "grad_norm": 0.92578125, + "learning_rate": 8.169677419354839e-05, + "loss": 0.1855, + "step": 11849 + }, + { + "epoch": 0.1896, + "grad_norm": 0.65234375, + "learning_rate": 8.169516129032259e-05, + "loss": 0.1769, + "step": 11850 + }, + { + "epoch": 0.189616, + "grad_norm": 1.078125, + "learning_rate": 8.169354838709678e-05, + "loss": 0.1889, + "step": 11851 + }, + { + "epoch": 0.189632, + "grad_norm": 1.09375, + "learning_rate": 8.169193548387098e-05, + "loss": 0.2284, + "step": 11852 + }, + { + "epoch": 0.189648, + "grad_norm": 0.88671875, + "learning_rate": 8.169032258064516e-05, + "loss": 0.1745, + "step": 11853 + }, + { + "epoch": 0.189664, + "grad_norm": 0.66796875, + "learning_rate": 8.168870967741936e-05, + "loss": 0.1863, + "step": 11854 + }, + { + "epoch": 0.18968, + "grad_norm": 0.65625, + "learning_rate": 8.168709677419355e-05, + "loss": 0.1375, + "step": 11855 + }, + { + "epoch": 0.189696, + "grad_norm": 0.5625, + "learning_rate": 8.168548387096775e-05, + "loss": 0.1741, + "step": 11856 + }, + { + "epoch": 0.189712, + "grad_norm": 0.7265625, + "learning_rate": 8.168387096774194e-05, + "loss": 0.1357, + "step": 11857 + }, + { + "epoch": 0.189728, + "grad_norm": 0.7890625, + "learning_rate": 8.168225806451614e-05, + "loss": 0.1918, + "step": 11858 + }, + { + "epoch": 0.189744, + "grad_norm": 0.96875, + "learning_rate": 8.168064516129032e-05, + "loss": 0.1886, + "step": 11859 + }, + { + "epoch": 0.18976, + "grad_norm": 0.81640625, + "learning_rate": 8.167903225806452e-05, + "loss": 0.1799, + "step": 11860 + }, + { + "epoch": 0.189776, + "grad_norm": 0.734375, + "learning_rate": 8.167741935483871e-05, + "loss": 0.1738, + "step": 11861 + }, + { + "epoch": 0.189792, + "grad_norm": 0.671875, + "learning_rate": 8.167580645161291e-05, + "loss": 0.1568, + "step": 11862 + }, + { + "epoch": 0.189808, + "grad_norm": 0.609375, + "learning_rate": 8.16741935483871e-05, + "loss": 0.1549, + "step": 11863 + }, + { + "epoch": 0.189824, + "grad_norm": 0.7578125, + "learning_rate": 8.167258064516129e-05, + "loss": 0.1727, + "step": 11864 + }, + { + "epoch": 0.18984, + "grad_norm": 0.8203125, + "learning_rate": 8.167096774193549e-05, + "loss": 0.1692, + "step": 11865 + }, + { + "epoch": 0.189856, + "grad_norm": 0.65625, + "learning_rate": 8.166935483870968e-05, + "loss": 0.1662, + "step": 11866 + }, + { + "epoch": 0.189872, + "grad_norm": 0.671875, + "learning_rate": 8.166774193548388e-05, + "loss": 0.156, + "step": 11867 + }, + { + "epoch": 0.189888, + "grad_norm": 0.9296875, + "learning_rate": 8.166612903225806e-05, + "loss": 0.2154, + "step": 11868 + }, + { + "epoch": 0.189904, + "grad_norm": 0.69921875, + "learning_rate": 8.166451612903226e-05, + "loss": 0.1458, + "step": 11869 + }, + { + "epoch": 0.18992, + "grad_norm": 0.6640625, + "learning_rate": 8.166290322580645e-05, + "loss": 0.1763, + "step": 11870 + }, + { + "epoch": 0.189936, + "grad_norm": 0.90234375, + "learning_rate": 8.166129032258065e-05, + "loss": 0.1738, + "step": 11871 + }, + { + "epoch": 0.189952, + "grad_norm": 0.73046875, + "learning_rate": 8.165967741935484e-05, + "loss": 0.1282, + "step": 11872 + }, + { + "epoch": 0.189968, + "grad_norm": 0.703125, + "learning_rate": 8.165806451612903e-05, + "loss": 0.1729, + "step": 11873 + }, + { + "epoch": 0.189984, + "grad_norm": 1.125, + "learning_rate": 8.165645161290323e-05, + "loss": 0.1918, + "step": 11874 + }, + { + "epoch": 0.19, + "grad_norm": 0.64453125, + "learning_rate": 8.165483870967743e-05, + "loss": 0.1431, + "step": 11875 + }, + { + "epoch": 0.190016, + "grad_norm": 0.74609375, + "learning_rate": 8.165322580645162e-05, + "loss": 0.1337, + "step": 11876 + }, + { + "epoch": 0.190032, + "grad_norm": 0.50390625, + "learning_rate": 8.165161290322582e-05, + "loss": 0.1308, + "step": 11877 + }, + { + "epoch": 0.190048, + "grad_norm": 1.03125, + "learning_rate": 8.165e-05, + "loss": 0.1978, + "step": 11878 + }, + { + "epoch": 0.190064, + "grad_norm": 0.69921875, + "learning_rate": 8.164838709677419e-05, + "loss": 0.1601, + "step": 11879 + }, + { + "epoch": 0.19008, + "grad_norm": 1.0, + "learning_rate": 8.164677419354839e-05, + "loss": 0.1987, + "step": 11880 + }, + { + "epoch": 0.190096, + "grad_norm": 0.69921875, + "learning_rate": 8.164516129032258e-05, + "loss": 0.1673, + "step": 11881 + }, + { + "epoch": 0.190112, + "grad_norm": 1.0234375, + "learning_rate": 8.164354838709678e-05, + "loss": 0.1659, + "step": 11882 + }, + { + "epoch": 0.190128, + "grad_norm": 0.58203125, + "learning_rate": 8.164193548387096e-05, + "loss": 0.1569, + "step": 11883 + }, + { + "epoch": 0.190144, + "grad_norm": 0.69140625, + "learning_rate": 8.164032258064516e-05, + "loss": 0.1894, + "step": 11884 + }, + { + "epoch": 0.19016, + "grad_norm": 0.89453125, + "learning_rate": 8.163870967741936e-05, + "loss": 0.2247, + "step": 11885 + }, + { + "epoch": 0.190176, + "grad_norm": 1.2578125, + "learning_rate": 8.163709677419355e-05, + "loss": 0.2484, + "step": 11886 + }, + { + "epoch": 0.190192, + "grad_norm": 0.83203125, + "learning_rate": 8.163548387096775e-05, + "loss": 0.1584, + "step": 11887 + }, + { + "epoch": 0.190208, + "grad_norm": 0.73046875, + "learning_rate": 8.163387096774195e-05, + "loss": 0.1516, + "step": 11888 + }, + { + "epoch": 0.190224, + "grad_norm": 0.6171875, + "learning_rate": 8.163225806451613e-05, + "loss": 0.1572, + "step": 11889 + }, + { + "epoch": 0.19024, + "grad_norm": 1.1328125, + "learning_rate": 8.163064516129033e-05, + "loss": 0.1681, + "step": 11890 + }, + { + "epoch": 0.190256, + "grad_norm": 0.83203125, + "learning_rate": 8.162903225806452e-05, + "loss": 0.1858, + "step": 11891 + }, + { + "epoch": 0.190272, + "grad_norm": 0.703125, + "learning_rate": 8.162741935483872e-05, + "loss": 0.1856, + "step": 11892 + }, + { + "epoch": 0.190288, + "grad_norm": 0.97265625, + "learning_rate": 8.16258064516129e-05, + "loss": 0.2081, + "step": 11893 + }, + { + "epoch": 0.190304, + "grad_norm": 0.72265625, + "learning_rate": 8.162419354838709e-05, + "loss": 0.1729, + "step": 11894 + }, + { + "epoch": 0.19032, + "grad_norm": 0.59375, + "learning_rate": 8.162258064516129e-05, + "loss": 0.14, + "step": 11895 + }, + { + "epoch": 0.190336, + "grad_norm": 0.671875, + "learning_rate": 8.162096774193548e-05, + "loss": 0.209, + "step": 11896 + }, + { + "epoch": 0.190352, + "grad_norm": 1.125, + "learning_rate": 8.161935483870968e-05, + "loss": 0.1572, + "step": 11897 + }, + { + "epoch": 0.190368, + "grad_norm": 0.6953125, + "learning_rate": 8.161774193548388e-05, + "loss": 0.1597, + "step": 11898 + }, + { + "epoch": 0.190384, + "grad_norm": 0.71875, + "learning_rate": 8.161612903225808e-05, + "loss": 0.186, + "step": 11899 + }, + { + "epoch": 0.1904, + "grad_norm": 1.046875, + "learning_rate": 8.161451612903226e-05, + "loss": 0.1794, + "step": 11900 + }, + { + "epoch": 0.190416, + "grad_norm": 0.6953125, + "learning_rate": 8.161290322580646e-05, + "loss": 0.1776, + "step": 11901 + }, + { + "epoch": 0.190432, + "grad_norm": 0.93359375, + "learning_rate": 8.161129032258065e-05, + "loss": 0.1443, + "step": 11902 + }, + { + "epoch": 0.190448, + "grad_norm": 1.09375, + "learning_rate": 8.160967741935485e-05, + "loss": 0.1626, + "step": 11903 + }, + { + "epoch": 0.190464, + "grad_norm": 0.78515625, + "learning_rate": 8.160806451612903e-05, + "loss": 0.2084, + "step": 11904 + }, + { + "epoch": 0.19048, + "grad_norm": 0.80078125, + "learning_rate": 8.160645161290323e-05, + "loss": 0.1847, + "step": 11905 + }, + { + "epoch": 0.190496, + "grad_norm": 1.0859375, + "learning_rate": 8.160483870967742e-05, + "loss": 0.1838, + "step": 11906 + }, + { + "epoch": 0.190512, + "grad_norm": 0.99609375, + "learning_rate": 8.160322580645162e-05, + "loss": 0.1841, + "step": 11907 + }, + { + "epoch": 0.190528, + "grad_norm": 0.5234375, + "learning_rate": 8.16016129032258e-05, + "loss": 0.143, + "step": 11908 + }, + { + "epoch": 0.190544, + "grad_norm": 0.61328125, + "learning_rate": 8.16e-05, + "loss": 0.2129, + "step": 11909 + }, + { + "epoch": 0.19056, + "grad_norm": 1.1953125, + "learning_rate": 8.15983870967742e-05, + "loss": 0.1723, + "step": 11910 + }, + { + "epoch": 0.190576, + "grad_norm": 0.56640625, + "learning_rate": 8.159677419354839e-05, + "loss": 0.1284, + "step": 11911 + }, + { + "epoch": 0.190592, + "grad_norm": 0.9375, + "learning_rate": 8.159516129032259e-05, + "loss": 0.2074, + "step": 11912 + }, + { + "epoch": 0.190608, + "grad_norm": 1.1484375, + "learning_rate": 8.159354838709678e-05, + "loss": 0.1905, + "step": 11913 + }, + { + "epoch": 0.190624, + "grad_norm": 0.73828125, + "learning_rate": 8.159193548387098e-05, + "loss": 0.1929, + "step": 11914 + }, + { + "epoch": 0.19064, + "grad_norm": 1.359375, + "learning_rate": 8.159032258064516e-05, + "loss": 0.1772, + "step": 11915 + }, + { + "epoch": 0.190656, + "grad_norm": 0.93359375, + "learning_rate": 8.158870967741936e-05, + "loss": 0.1642, + "step": 11916 + }, + { + "epoch": 0.190672, + "grad_norm": 0.9453125, + "learning_rate": 8.158709677419355e-05, + "loss": 0.1563, + "step": 11917 + }, + { + "epoch": 0.190688, + "grad_norm": 0.7890625, + "learning_rate": 8.158548387096775e-05, + "loss": 0.1964, + "step": 11918 + }, + { + "epoch": 0.190704, + "grad_norm": 0.734375, + "learning_rate": 8.158387096774193e-05, + "loss": 0.1582, + "step": 11919 + }, + { + "epoch": 0.19072, + "grad_norm": 0.96484375, + "learning_rate": 8.158225806451613e-05, + "loss": 0.2306, + "step": 11920 + }, + { + "epoch": 0.190736, + "grad_norm": 0.4921875, + "learning_rate": 8.158064516129032e-05, + "loss": 0.1567, + "step": 11921 + }, + { + "epoch": 0.190752, + "grad_norm": 0.625, + "learning_rate": 8.157903225806452e-05, + "loss": 0.2066, + "step": 11922 + }, + { + "epoch": 0.190768, + "grad_norm": 0.77734375, + "learning_rate": 8.157741935483872e-05, + "loss": 0.1827, + "step": 11923 + }, + { + "epoch": 0.190784, + "grad_norm": 0.59375, + "learning_rate": 8.15758064516129e-05, + "loss": 0.1495, + "step": 11924 + }, + { + "epoch": 0.1908, + "grad_norm": 1.0625, + "learning_rate": 8.15741935483871e-05, + "loss": 0.1605, + "step": 11925 + }, + { + "epoch": 0.190816, + "grad_norm": 0.77734375, + "learning_rate": 8.157258064516129e-05, + "loss": 0.1725, + "step": 11926 + }, + { + "epoch": 0.190832, + "grad_norm": 0.875, + "learning_rate": 8.157096774193549e-05, + "loss": 0.1624, + "step": 11927 + }, + { + "epoch": 0.190848, + "grad_norm": 0.62109375, + "learning_rate": 8.156935483870968e-05, + "loss": 0.1711, + "step": 11928 + }, + { + "epoch": 0.190864, + "grad_norm": 0.69921875, + "learning_rate": 8.156774193548388e-05, + "loss": 0.1741, + "step": 11929 + }, + { + "epoch": 0.19088, + "grad_norm": 0.8515625, + "learning_rate": 8.156612903225806e-05, + "loss": 0.133, + "step": 11930 + }, + { + "epoch": 0.190896, + "grad_norm": 0.703125, + "learning_rate": 8.156451612903226e-05, + "loss": 0.1753, + "step": 11931 + }, + { + "epoch": 0.190912, + "grad_norm": 0.96484375, + "learning_rate": 8.156290322580645e-05, + "loss": 0.1737, + "step": 11932 + }, + { + "epoch": 0.190928, + "grad_norm": 0.73046875, + "learning_rate": 8.156129032258065e-05, + "loss": 0.1813, + "step": 11933 + }, + { + "epoch": 0.190944, + "grad_norm": 0.71875, + "learning_rate": 8.155967741935485e-05, + "loss": 0.1477, + "step": 11934 + }, + { + "epoch": 0.19096, + "grad_norm": 0.9140625, + "learning_rate": 8.155806451612905e-05, + "loss": 0.1872, + "step": 11935 + }, + { + "epoch": 0.190976, + "grad_norm": 0.99609375, + "learning_rate": 8.155645161290323e-05, + "loss": 0.1918, + "step": 11936 + }, + { + "epoch": 0.190992, + "grad_norm": 0.6015625, + "learning_rate": 8.155483870967743e-05, + "loss": 0.1449, + "step": 11937 + }, + { + "epoch": 0.191008, + "grad_norm": 0.828125, + "learning_rate": 8.155322580645162e-05, + "loss": 0.1408, + "step": 11938 + }, + { + "epoch": 0.191024, + "grad_norm": 1.1171875, + "learning_rate": 8.155161290322582e-05, + "loss": 0.1845, + "step": 11939 + }, + { + "epoch": 0.19104, + "grad_norm": 0.859375, + "learning_rate": 8.155e-05, + "loss": 0.1856, + "step": 11940 + }, + { + "epoch": 0.191056, + "grad_norm": 0.58203125, + "learning_rate": 8.154838709677419e-05, + "loss": 0.1769, + "step": 11941 + }, + { + "epoch": 0.191072, + "grad_norm": 1.125, + "learning_rate": 8.154677419354839e-05, + "loss": 0.1756, + "step": 11942 + }, + { + "epoch": 0.191088, + "grad_norm": 1.5703125, + "learning_rate": 8.154516129032258e-05, + "loss": 0.1936, + "step": 11943 + }, + { + "epoch": 0.191104, + "grad_norm": 0.6640625, + "learning_rate": 8.154354838709677e-05, + "loss": 0.1781, + "step": 11944 + }, + { + "epoch": 0.19112, + "grad_norm": 1.015625, + "learning_rate": 8.154193548387097e-05, + "loss": 0.1908, + "step": 11945 + }, + { + "epoch": 0.191136, + "grad_norm": 1.2890625, + "learning_rate": 8.154032258064517e-05, + "loss": 0.2165, + "step": 11946 + }, + { + "epoch": 0.191152, + "grad_norm": 0.55859375, + "learning_rate": 8.153870967741936e-05, + "loss": 0.149, + "step": 11947 + }, + { + "epoch": 0.191168, + "grad_norm": 1.484375, + "learning_rate": 8.153709677419356e-05, + "loss": 0.1877, + "step": 11948 + }, + { + "epoch": 0.191184, + "grad_norm": 0.703125, + "learning_rate": 8.153548387096775e-05, + "loss": 0.1539, + "step": 11949 + }, + { + "epoch": 0.1912, + "grad_norm": 0.9453125, + "learning_rate": 8.153387096774195e-05, + "loss": 0.1881, + "step": 11950 + }, + { + "epoch": 0.191216, + "grad_norm": 0.6796875, + "learning_rate": 8.153225806451613e-05, + "loss": 0.1992, + "step": 11951 + }, + { + "epoch": 0.191232, + "grad_norm": 1.0078125, + "learning_rate": 8.153064516129033e-05, + "loss": 0.1678, + "step": 11952 + }, + { + "epoch": 0.191248, + "grad_norm": 0.7578125, + "learning_rate": 8.152903225806452e-05, + "loss": 0.1645, + "step": 11953 + }, + { + "epoch": 0.191264, + "grad_norm": 0.75, + "learning_rate": 8.152741935483872e-05, + "loss": 0.1634, + "step": 11954 + }, + { + "epoch": 0.19128, + "grad_norm": 1.609375, + "learning_rate": 8.15258064516129e-05, + "loss": 0.2071, + "step": 11955 + }, + { + "epoch": 0.191296, + "grad_norm": 0.890625, + "learning_rate": 8.152419354838709e-05, + "loss": 0.1893, + "step": 11956 + }, + { + "epoch": 0.191312, + "grad_norm": 0.76953125, + "learning_rate": 8.152258064516129e-05, + "loss": 0.1728, + "step": 11957 + }, + { + "epoch": 0.191328, + "grad_norm": 1.984375, + "learning_rate": 8.152096774193549e-05, + "loss": 0.2387, + "step": 11958 + }, + { + "epoch": 0.191344, + "grad_norm": 0.60546875, + "learning_rate": 8.151935483870969e-05, + "loss": 0.1949, + "step": 11959 + }, + { + "epoch": 0.19136, + "grad_norm": 0.494140625, + "learning_rate": 8.151774193548387e-05, + "loss": 0.1426, + "step": 11960 + }, + { + "epoch": 0.191376, + "grad_norm": 0.5546875, + "learning_rate": 8.151612903225807e-05, + "loss": 0.1599, + "step": 11961 + }, + { + "epoch": 0.191392, + "grad_norm": 0.67578125, + "learning_rate": 8.151451612903226e-05, + "loss": 0.1756, + "step": 11962 + }, + { + "epoch": 0.191408, + "grad_norm": 0.60546875, + "learning_rate": 8.151290322580646e-05, + "loss": 0.1509, + "step": 11963 + }, + { + "epoch": 0.191424, + "grad_norm": 0.58984375, + "learning_rate": 8.151129032258065e-05, + "loss": 0.1677, + "step": 11964 + }, + { + "epoch": 0.19144, + "grad_norm": 0.71875, + "learning_rate": 8.150967741935485e-05, + "loss": 0.1858, + "step": 11965 + }, + { + "epoch": 0.191456, + "grad_norm": 0.7265625, + "learning_rate": 8.150806451612903e-05, + "loss": 0.1741, + "step": 11966 + }, + { + "epoch": 0.191472, + "grad_norm": 0.734375, + "learning_rate": 8.150645161290323e-05, + "loss": 0.1919, + "step": 11967 + }, + { + "epoch": 0.191488, + "grad_norm": 1.0625, + "learning_rate": 8.150483870967742e-05, + "loss": 0.1871, + "step": 11968 + }, + { + "epoch": 0.191504, + "grad_norm": 0.52734375, + "learning_rate": 8.150322580645162e-05, + "loss": 0.1406, + "step": 11969 + }, + { + "epoch": 0.19152, + "grad_norm": 0.90625, + "learning_rate": 8.150161290322582e-05, + "loss": 0.1842, + "step": 11970 + }, + { + "epoch": 0.191536, + "grad_norm": 0.578125, + "learning_rate": 8.15e-05, + "loss": 0.167, + "step": 11971 + }, + { + "epoch": 0.191552, + "grad_norm": 0.6640625, + "learning_rate": 8.14983870967742e-05, + "loss": 0.1892, + "step": 11972 + }, + { + "epoch": 0.191568, + "grad_norm": 0.8984375, + "learning_rate": 8.149677419354839e-05, + "loss": 0.206, + "step": 11973 + }, + { + "epoch": 0.191584, + "grad_norm": 0.66015625, + "learning_rate": 8.149516129032259e-05, + "loss": 0.1523, + "step": 11974 + }, + { + "epoch": 0.1916, + "grad_norm": 0.65625, + "learning_rate": 8.149354838709677e-05, + "loss": 0.1579, + "step": 11975 + }, + { + "epoch": 0.191616, + "grad_norm": 0.76171875, + "learning_rate": 8.149193548387097e-05, + "loss": 0.177, + "step": 11976 + }, + { + "epoch": 0.191632, + "grad_norm": 1.6015625, + "learning_rate": 8.149032258064516e-05, + "loss": 0.1818, + "step": 11977 + }, + { + "epoch": 0.191648, + "grad_norm": 0.91015625, + "learning_rate": 8.148870967741936e-05, + "loss": 0.1722, + "step": 11978 + }, + { + "epoch": 0.191664, + "grad_norm": 0.71875, + "learning_rate": 8.148709677419355e-05, + "loss": 0.186, + "step": 11979 + }, + { + "epoch": 0.19168, + "grad_norm": 0.65234375, + "learning_rate": 8.148548387096774e-05, + "loss": 0.1677, + "step": 11980 + }, + { + "epoch": 0.191696, + "grad_norm": 0.94921875, + "learning_rate": 8.148387096774193e-05, + "loss": 0.1743, + "step": 11981 + }, + { + "epoch": 0.191712, + "grad_norm": 0.84765625, + "learning_rate": 8.148225806451613e-05, + "loss": 0.1682, + "step": 11982 + }, + { + "epoch": 0.191728, + "grad_norm": 1.7890625, + "learning_rate": 8.148064516129033e-05, + "loss": 0.1973, + "step": 11983 + }, + { + "epoch": 0.191744, + "grad_norm": 0.5703125, + "learning_rate": 8.147903225806453e-05, + "loss": 0.1673, + "step": 11984 + }, + { + "epoch": 0.19176, + "grad_norm": 0.7109375, + "learning_rate": 8.147741935483872e-05, + "loss": 0.1794, + "step": 11985 + }, + { + "epoch": 0.191776, + "grad_norm": 1.7421875, + "learning_rate": 8.14758064516129e-05, + "loss": 0.1803, + "step": 11986 + }, + { + "epoch": 0.191792, + "grad_norm": 0.77734375, + "learning_rate": 8.14741935483871e-05, + "loss": 0.1747, + "step": 11987 + }, + { + "epoch": 0.191808, + "grad_norm": 0.83984375, + "learning_rate": 8.147258064516129e-05, + "loss": 0.1757, + "step": 11988 + }, + { + "epoch": 0.191824, + "grad_norm": 0.734375, + "learning_rate": 8.147096774193549e-05, + "loss": 0.1569, + "step": 11989 + }, + { + "epoch": 0.19184, + "grad_norm": 0.8046875, + "learning_rate": 8.146935483870967e-05, + "loss": 0.1541, + "step": 11990 + }, + { + "epoch": 0.191856, + "grad_norm": 0.74609375, + "learning_rate": 8.146774193548387e-05, + "loss": 0.16, + "step": 11991 + }, + { + "epoch": 0.191872, + "grad_norm": 1.0078125, + "learning_rate": 8.146612903225806e-05, + "loss": 0.176, + "step": 11992 + }, + { + "epoch": 0.191888, + "grad_norm": 0.76953125, + "learning_rate": 8.146451612903226e-05, + "loss": 0.1452, + "step": 11993 + }, + { + "epoch": 0.191904, + "grad_norm": 0.76953125, + "learning_rate": 8.146290322580646e-05, + "loss": 0.1893, + "step": 11994 + }, + { + "epoch": 0.19192, + "grad_norm": 0.8515625, + "learning_rate": 8.146129032258066e-05, + "loss": 0.1444, + "step": 11995 + }, + { + "epoch": 0.191936, + "grad_norm": 0.65234375, + "learning_rate": 8.145967741935484e-05, + "loss": 0.1599, + "step": 11996 + }, + { + "epoch": 0.191952, + "grad_norm": 0.75, + "learning_rate": 8.145806451612904e-05, + "loss": 0.1713, + "step": 11997 + }, + { + "epoch": 0.191968, + "grad_norm": 0.76171875, + "learning_rate": 8.145645161290323e-05, + "loss": 0.1782, + "step": 11998 + }, + { + "epoch": 0.191984, + "grad_norm": 1.03125, + "learning_rate": 8.145483870967743e-05, + "loss": 0.18, + "step": 11999 + }, + { + "epoch": 0.192, + "grad_norm": 0.76171875, + "learning_rate": 8.145322580645162e-05, + "loss": 0.2018, + "step": 12000 + }, + { + "epoch": 0.192016, + "grad_norm": 0.98046875, + "learning_rate": 8.145161290322582e-05, + "loss": 0.157, + "step": 12001 + }, + { + "epoch": 0.192032, + "grad_norm": 0.72265625, + "learning_rate": 8.145e-05, + "loss": 0.2087, + "step": 12002 + }, + { + "epoch": 0.192048, + "grad_norm": 1.0703125, + "learning_rate": 8.144838709677419e-05, + "loss": 0.228, + "step": 12003 + }, + { + "epoch": 0.192064, + "grad_norm": 1.1796875, + "learning_rate": 8.144677419354839e-05, + "loss": 0.1512, + "step": 12004 + }, + { + "epoch": 0.19208, + "grad_norm": 0.5859375, + "learning_rate": 8.144516129032259e-05, + "loss": 0.1682, + "step": 12005 + }, + { + "epoch": 0.192096, + "grad_norm": 0.91015625, + "learning_rate": 8.144354838709679e-05, + "loss": 0.1486, + "step": 12006 + }, + { + "epoch": 0.192112, + "grad_norm": 1.28125, + "learning_rate": 8.144193548387097e-05, + "loss": 0.2095, + "step": 12007 + }, + { + "epoch": 0.192128, + "grad_norm": 0.65234375, + "learning_rate": 8.144032258064517e-05, + "loss": 0.1398, + "step": 12008 + }, + { + "epoch": 0.192144, + "grad_norm": 0.953125, + "learning_rate": 8.143870967741936e-05, + "loss": 0.1058, + "step": 12009 + }, + { + "epoch": 0.19216, + "grad_norm": 0.671875, + "learning_rate": 8.143709677419356e-05, + "loss": 0.1678, + "step": 12010 + }, + { + "epoch": 0.192176, + "grad_norm": 1.015625, + "learning_rate": 8.143548387096774e-05, + "loss": 0.1366, + "step": 12011 + }, + { + "epoch": 0.192192, + "grad_norm": 0.71484375, + "learning_rate": 8.143387096774194e-05, + "loss": 0.178, + "step": 12012 + }, + { + "epoch": 0.192208, + "grad_norm": 0.546875, + "learning_rate": 8.143225806451613e-05, + "loss": 0.1335, + "step": 12013 + }, + { + "epoch": 0.192224, + "grad_norm": 0.91015625, + "learning_rate": 8.143064516129033e-05, + "loss": 0.1176, + "step": 12014 + }, + { + "epoch": 0.19224, + "grad_norm": 1.3203125, + "learning_rate": 8.142903225806452e-05, + "loss": 0.1799, + "step": 12015 + }, + { + "epoch": 0.192256, + "grad_norm": 0.7421875, + "learning_rate": 8.142741935483871e-05, + "loss": 0.1879, + "step": 12016 + }, + { + "epoch": 0.192272, + "grad_norm": 0.65625, + "learning_rate": 8.14258064516129e-05, + "loss": 0.1544, + "step": 12017 + }, + { + "epoch": 0.192288, + "grad_norm": 0.77734375, + "learning_rate": 8.14241935483871e-05, + "loss": 0.1833, + "step": 12018 + }, + { + "epoch": 0.192304, + "grad_norm": 0.7421875, + "learning_rate": 8.14225806451613e-05, + "loss": 0.1928, + "step": 12019 + }, + { + "epoch": 0.19232, + "grad_norm": 1.0703125, + "learning_rate": 8.142096774193549e-05, + "loss": 0.1658, + "step": 12020 + }, + { + "epoch": 0.192336, + "grad_norm": 0.74609375, + "learning_rate": 8.141935483870969e-05, + "loss": 0.189, + "step": 12021 + }, + { + "epoch": 0.192352, + "grad_norm": 0.546875, + "learning_rate": 8.141774193548387e-05, + "loss": 0.1168, + "step": 12022 + }, + { + "epoch": 0.192368, + "grad_norm": 0.80859375, + "learning_rate": 8.141612903225807e-05, + "loss": 0.2125, + "step": 12023 + }, + { + "epoch": 0.192384, + "grad_norm": 0.6640625, + "learning_rate": 8.141451612903226e-05, + "loss": 0.187, + "step": 12024 + }, + { + "epoch": 0.1924, + "grad_norm": 0.87109375, + "learning_rate": 8.141290322580646e-05, + "loss": 0.1665, + "step": 12025 + }, + { + "epoch": 0.192416, + "grad_norm": 0.9921875, + "learning_rate": 8.141129032258064e-05, + "loss": 0.1607, + "step": 12026 + }, + { + "epoch": 0.192432, + "grad_norm": 0.7421875, + "learning_rate": 8.140967741935484e-05, + "loss": 0.1551, + "step": 12027 + }, + { + "epoch": 0.192448, + "grad_norm": 0.55859375, + "learning_rate": 8.140806451612903e-05, + "loss": 0.1531, + "step": 12028 + }, + { + "epoch": 0.192464, + "grad_norm": 0.81640625, + "learning_rate": 8.140645161290323e-05, + "loss": 0.1802, + "step": 12029 + }, + { + "epoch": 0.19248, + "grad_norm": 0.61328125, + "learning_rate": 8.140483870967743e-05, + "loss": 0.1597, + "step": 12030 + }, + { + "epoch": 0.192496, + "grad_norm": 0.5625, + "learning_rate": 8.140322580645163e-05, + "loss": 0.1184, + "step": 12031 + }, + { + "epoch": 0.192512, + "grad_norm": 0.6796875, + "learning_rate": 8.140161290322581e-05, + "loss": 0.1725, + "step": 12032 + }, + { + "epoch": 0.192528, + "grad_norm": 0.77734375, + "learning_rate": 8.14e-05, + "loss": 0.174, + "step": 12033 + }, + { + "epoch": 0.192544, + "grad_norm": 0.703125, + "learning_rate": 8.13983870967742e-05, + "loss": 0.1714, + "step": 12034 + }, + { + "epoch": 0.19256, + "grad_norm": 0.7578125, + "learning_rate": 8.139677419354839e-05, + "loss": 0.1922, + "step": 12035 + }, + { + "epoch": 0.192576, + "grad_norm": 0.486328125, + "learning_rate": 8.139516129032259e-05, + "loss": 0.159, + "step": 12036 + }, + { + "epoch": 0.192592, + "grad_norm": 1.171875, + "learning_rate": 8.139354838709677e-05, + "loss": 0.1534, + "step": 12037 + }, + { + "epoch": 0.192608, + "grad_norm": 0.82421875, + "learning_rate": 8.139193548387097e-05, + "loss": 0.1872, + "step": 12038 + }, + { + "epoch": 0.192624, + "grad_norm": 0.8828125, + "learning_rate": 8.139032258064516e-05, + "loss": 0.1959, + "step": 12039 + }, + { + "epoch": 0.19264, + "grad_norm": 0.67578125, + "learning_rate": 8.138870967741936e-05, + "loss": 0.1618, + "step": 12040 + }, + { + "epoch": 0.192656, + "grad_norm": 0.67578125, + "learning_rate": 8.138709677419356e-05, + "loss": 0.1648, + "step": 12041 + }, + { + "epoch": 0.192672, + "grad_norm": 1.1171875, + "learning_rate": 8.138548387096776e-05, + "loss": 0.2303, + "step": 12042 + }, + { + "epoch": 0.192688, + "grad_norm": 0.98046875, + "learning_rate": 8.138387096774194e-05, + "loss": 0.1595, + "step": 12043 + }, + { + "epoch": 0.192704, + "grad_norm": 0.703125, + "learning_rate": 8.138225806451614e-05, + "loss": 0.1584, + "step": 12044 + }, + { + "epoch": 0.19272, + "grad_norm": 0.609375, + "learning_rate": 8.138064516129033e-05, + "loss": 0.1857, + "step": 12045 + }, + { + "epoch": 0.192736, + "grad_norm": 0.78125, + "learning_rate": 8.137903225806453e-05, + "loss": 0.202, + "step": 12046 + }, + { + "epoch": 0.192752, + "grad_norm": 1.09375, + "learning_rate": 8.137741935483871e-05, + "loss": 0.1729, + "step": 12047 + }, + { + "epoch": 0.192768, + "grad_norm": 0.66796875, + "learning_rate": 8.137580645161291e-05, + "loss": 0.1547, + "step": 12048 + }, + { + "epoch": 0.192784, + "grad_norm": 1.1328125, + "learning_rate": 8.13741935483871e-05, + "loss": 0.2029, + "step": 12049 + }, + { + "epoch": 0.1928, + "grad_norm": 0.81640625, + "learning_rate": 8.137258064516129e-05, + "loss": 0.1664, + "step": 12050 + }, + { + "epoch": 0.192816, + "grad_norm": 0.49609375, + "learning_rate": 8.137096774193548e-05, + "loss": 0.1722, + "step": 12051 + }, + { + "epoch": 0.192832, + "grad_norm": 0.8203125, + "learning_rate": 8.136935483870967e-05, + "loss": 0.1888, + "step": 12052 + }, + { + "epoch": 0.192848, + "grad_norm": 0.98828125, + "learning_rate": 8.136774193548387e-05, + "loss": 0.1363, + "step": 12053 + }, + { + "epoch": 0.192864, + "grad_norm": 1.40625, + "learning_rate": 8.136612903225807e-05, + "loss": 0.1707, + "step": 12054 + }, + { + "epoch": 0.19288, + "grad_norm": 0.60546875, + "learning_rate": 8.136451612903227e-05, + "loss": 0.1562, + "step": 12055 + }, + { + "epoch": 0.192896, + "grad_norm": 1.1015625, + "learning_rate": 8.136290322580646e-05, + "loss": 0.1891, + "step": 12056 + }, + { + "epoch": 0.192912, + "grad_norm": 0.91796875, + "learning_rate": 8.136129032258066e-05, + "loss": 0.1921, + "step": 12057 + }, + { + "epoch": 0.192928, + "grad_norm": 1.3984375, + "learning_rate": 8.135967741935484e-05, + "loss": 0.1801, + "step": 12058 + }, + { + "epoch": 0.192944, + "grad_norm": 1.390625, + "learning_rate": 8.135806451612904e-05, + "loss": 0.1697, + "step": 12059 + }, + { + "epoch": 0.19296, + "grad_norm": 0.7109375, + "learning_rate": 8.135645161290323e-05, + "loss": 0.1813, + "step": 12060 + }, + { + "epoch": 0.192976, + "grad_norm": 0.6328125, + "learning_rate": 8.135483870967743e-05, + "loss": 0.1752, + "step": 12061 + }, + { + "epoch": 0.192992, + "grad_norm": 1.3203125, + "learning_rate": 8.135322580645161e-05, + "loss": 0.2173, + "step": 12062 + }, + { + "epoch": 0.193008, + "grad_norm": 0.609375, + "learning_rate": 8.135161290322581e-05, + "loss": 0.153, + "step": 12063 + }, + { + "epoch": 0.193024, + "grad_norm": 0.5234375, + "learning_rate": 8.135e-05, + "loss": 0.1572, + "step": 12064 + }, + { + "epoch": 0.19304, + "grad_norm": 0.78125, + "learning_rate": 8.13483870967742e-05, + "loss": 0.1846, + "step": 12065 + }, + { + "epoch": 0.193056, + "grad_norm": 0.6328125, + "learning_rate": 8.13467741935484e-05, + "loss": 0.1786, + "step": 12066 + }, + { + "epoch": 0.193072, + "grad_norm": 0.90625, + "learning_rate": 8.134516129032258e-05, + "loss": 0.1661, + "step": 12067 + }, + { + "epoch": 0.193088, + "grad_norm": 0.83203125, + "learning_rate": 8.134354838709678e-05, + "loss": 0.171, + "step": 12068 + }, + { + "epoch": 0.193104, + "grad_norm": 0.875, + "learning_rate": 8.134193548387097e-05, + "loss": 0.1798, + "step": 12069 + }, + { + "epoch": 0.19312, + "grad_norm": 0.71484375, + "learning_rate": 8.134032258064517e-05, + "loss": 0.1812, + "step": 12070 + }, + { + "epoch": 0.193136, + "grad_norm": 0.7421875, + "learning_rate": 8.133870967741936e-05, + "loss": 0.1941, + "step": 12071 + }, + { + "epoch": 0.193152, + "grad_norm": 0.59375, + "learning_rate": 8.133709677419356e-05, + "loss": 0.1695, + "step": 12072 + }, + { + "epoch": 0.193168, + "grad_norm": 1.046875, + "learning_rate": 8.133548387096774e-05, + "loss": 0.1642, + "step": 12073 + }, + { + "epoch": 0.193184, + "grad_norm": 0.80859375, + "learning_rate": 8.133387096774194e-05, + "loss": 0.1804, + "step": 12074 + }, + { + "epoch": 0.1932, + "grad_norm": 0.9921875, + "learning_rate": 8.133225806451613e-05, + "loss": 0.1946, + "step": 12075 + }, + { + "epoch": 0.193216, + "grad_norm": 1.203125, + "learning_rate": 8.133064516129033e-05, + "loss": 0.2143, + "step": 12076 + }, + { + "epoch": 0.193232, + "grad_norm": 0.90625, + "learning_rate": 8.132903225806451e-05, + "loss": 0.1608, + "step": 12077 + }, + { + "epoch": 0.193248, + "grad_norm": 0.83203125, + "learning_rate": 8.132741935483871e-05, + "loss": 0.164, + "step": 12078 + }, + { + "epoch": 0.193264, + "grad_norm": 1.078125, + "learning_rate": 8.132580645161291e-05, + "loss": 0.2016, + "step": 12079 + }, + { + "epoch": 0.19328, + "grad_norm": 0.88671875, + "learning_rate": 8.13241935483871e-05, + "loss": 0.192, + "step": 12080 + }, + { + "epoch": 0.193296, + "grad_norm": 0.859375, + "learning_rate": 8.13225806451613e-05, + "loss": 0.111, + "step": 12081 + }, + { + "epoch": 0.193312, + "grad_norm": 0.8671875, + "learning_rate": 8.132096774193548e-05, + "loss": 0.1485, + "step": 12082 + }, + { + "epoch": 0.193328, + "grad_norm": 0.6953125, + "learning_rate": 8.131935483870968e-05, + "loss": 0.1667, + "step": 12083 + }, + { + "epoch": 0.193344, + "grad_norm": 0.55078125, + "learning_rate": 8.131774193548387e-05, + "loss": 0.1931, + "step": 12084 + }, + { + "epoch": 0.19336, + "grad_norm": 1.265625, + "learning_rate": 8.131612903225807e-05, + "loss": 0.1854, + "step": 12085 + }, + { + "epoch": 0.193376, + "grad_norm": 0.734375, + "learning_rate": 8.131451612903226e-05, + "loss": 0.1901, + "step": 12086 + }, + { + "epoch": 0.193392, + "grad_norm": 0.62890625, + "learning_rate": 8.131290322580645e-05, + "loss": 0.1508, + "step": 12087 + }, + { + "epoch": 0.193408, + "grad_norm": 0.7734375, + "learning_rate": 8.131129032258064e-05, + "loss": 0.1578, + "step": 12088 + }, + { + "epoch": 0.193424, + "grad_norm": 0.59375, + "learning_rate": 8.130967741935484e-05, + "loss": 0.1929, + "step": 12089 + }, + { + "epoch": 0.19344, + "grad_norm": 0.66796875, + "learning_rate": 8.130806451612904e-05, + "loss": 0.1711, + "step": 12090 + }, + { + "epoch": 0.193456, + "grad_norm": 0.93359375, + "learning_rate": 8.130645161290324e-05, + "loss": 0.1416, + "step": 12091 + }, + { + "epoch": 0.193472, + "grad_norm": 1.0390625, + "learning_rate": 8.130483870967743e-05, + "loss": 0.1592, + "step": 12092 + }, + { + "epoch": 0.193488, + "grad_norm": 0.8203125, + "learning_rate": 8.130322580645163e-05, + "loss": 0.1681, + "step": 12093 + }, + { + "epoch": 0.193504, + "grad_norm": 0.78125, + "learning_rate": 8.130161290322581e-05, + "loss": 0.1595, + "step": 12094 + }, + { + "epoch": 0.19352, + "grad_norm": 0.671875, + "learning_rate": 8.13e-05, + "loss": 0.186, + "step": 12095 + }, + { + "epoch": 0.193536, + "grad_norm": 0.75390625, + "learning_rate": 8.12983870967742e-05, + "loss": 0.1877, + "step": 12096 + }, + { + "epoch": 0.193552, + "grad_norm": 0.8359375, + "learning_rate": 8.129677419354838e-05, + "loss": 0.1667, + "step": 12097 + }, + { + "epoch": 0.193568, + "grad_norm": 0.50390625, + "learning_rate": 8.129516129032258e-05, + "loss": 0.1691, + "step": 12098 + }, + { + "epoch": 0.193584, + "grad_norm": 0.7734375, + "learning_rate": 8.129354838709677e-05, + "loss": 0.1647, + "step": 12099 + }, + { + "epoch": 0.1936, + "grad_norm": 0.6875, + "learning_rate": 8.129193548387097e-05, + "loss": 0.1933, + "step": 12100 + }, + { + "epoch": 0.193616, + "grad_norm": 0.68359375, + "learning_rate": 8.129032258064517e-05, + "loss": 0.1766, + "step": 12101 + }, + { + "epoch": 0.193632, + "grad_norm": 1.015625, + "learning_rate": 8.128870967741937e-05, + "loss": 0.2322, + "step": 12102 + }, + { + "epoch": 0.193648, + "grad_norm": 0.67578125, + "learning_rate": 8.128709677419355e-05, + "loss": 0.1867, + "step": 12103 + }, + { + "epoch": 0.193664, + "grad_norm": 0.59765625, + "learning_rate": 8.128548387096775e-05, + "loss": 0.138, + "step": 12104 + }, + { + "epoch": 0.19368, + "grad_norm": 0.921875, + "learning_rate": 8.128387096774194e-05, + "loss": 0.1641, + "step": 12105 + }, + { + "epoch": 0.193696, + "grad_norm": 0.55078125, + "learning_rate": 8.128225806451614e-05, + "loss": 0.15, + "step": 12106 + }, + { + "epoch": 0.193712, + "grad_norm": 0.80859375, + "learning_rate": 8.128064516129033e-05, + "loss": 0.1457, + "step": 12107 + }, + { + "epoch": 0.193728, + "grad_norm": 0.77734375, + "learning_rate": 8.127903225806452e-05, + "loss": 0.1346, + "step": 12108 + }, + { + "epoch": 0.193744, + "grad_norm": 1.1953125, + "learning_rate": 8.127741935483871e-05, + "loss": 0.1804, + "step": 12109 + }, + { + "epoch": 0.19376, + "grad_norm": 0.7109375, + "learning_rate": 8.127580645161291e-05, + "loss": 0.1332, + "step": 12110 + }, + { + "epoch": 0.193776, + "grad_norm": 0.63671875, + "learning_rate": 8.12741935483871e-05, + "loss": 0.1553, + "step": 12111 + }, + { + "epoch": 0.193792, + "grad_norm": 0.83984375, + "learning_rate": 8.127258064516128e-05, + "loss": 0.1946, + "step": 12112 + }, + { + "epoch": 0.193808, + "grad_norm": 0.87109375, + "learning_rate": 8.127096774193548e-05, + "loss": 0.174, + "step": 12113 + }, + { + "epoch": 0.193824, + "grad_norm": 0.89453125, + "learning_rate": 8.126935483870968e-05, + "loss": 0.1923, + "step": 12114 + }, + { + "epoch": 0.19384, + "grad_norm": 0.64453125, + "learning_rate": 8.126774193548388e-05, + "loss": 0.1452, + "step": 12115 + }, + { + "epoch": 0.193856, + "grad_norm": 1.2734375, + "learning_rate": 8.126612903225807e-05, + "loss": 0.1872, + "step": 12116 + }, + { + "epoch": 0.193872, + "grad_norm": 1.0390625, + "learning_rate": 8.126451612903227e-05, + "loss": 0.1879, + "step": 12117 + }, + { + "epoch": 0.193888, + "grad_norm": 0.76953125, + "learning_rate": 8.126290322580645e-05, + "loss": 0.167, + "step": 12118 + }, + { + "epoch": 0.193904, + "grad_norm": 0.765625, + "learning_rate": 8.126129032258065e-05, + "loss": 0.1867, + "step": 12119 + }, + { + "epoch": 0.19392, + "grad_norm": 1.0390625, + "learning_rate": 8.125967741935484e-05, + "loss": 0.1867, + "step": 12120 + }, + { + "epoch": 0.193936, + "grad_norm": 0.78125, + "learning_rate": 8.125806451612904e-05, + "loss": 0.1586, + "step": 12121 + }, + { + "epoch": 0.193952, + "grad_norm": 1.1015625, + "learning_rate": 8.125645161290322e-05, + "loss": 0.1476, + "step": 12122 + }, + { + "epoch": 0.193968, + "grad_norm": 0.75390625, + "learning_rate": 8.125483870967742e-05, + "loss": 0.1974, + "step": 12123 + }, + { + "epoch": 0.193984, + "grad_norm": 1.1328125, + "learning_rate": 8.125322580645161e-05, + "loss": 0.1662, + "step": 12124 + }, + { + "epoch": 0.194, + "grad_norm": 1.109375, + "learning_rate": 8.125161290322581e-05, + "loss": 0.1659, + "step": 12125 + }, + { + "epoch": 0.194016, + "grad_norm": 0.478515625, + "learning_rate": 8.125000000000001e-05, + "loss": 0.148, + "step": 12126 + }, + { + "epoch": 0.194032, + "grad_norm": 0.7578125, + "learning_rate": 8.12483870967742e-05, + "loss": 0.1741, + "step": 12127 + }, + { + "epoch": 0.194048, + "grad_norm": 0.68359375, + "learning_rate": 8.12467741935484e-05, + "loss": 0.1897, + "step": 12128 + }, + { + "epoch": 0.194064, + "grad_norm": 0.859375, + "learning_rate": 8.124516129032258e-05, + "loss": 0.2078, + "step": 12129 + }, + { + "epoch": 0.19408, + "grad_norm": 0.73046875, + "learning_rate": 8.124354838709678e-05, + "loss": 0.1602, + "step": 12130 + }, + { + "epoch": 0.194096, + "grad_norm": 0.75390625, + "learning_rate": 8.124193548387097e-05, + "loss": 0.1594, + "step": 12131 + }, + { + "epoch": 0.194112, + "grad_norm": 0.8125, + "learning_rate": 8.124032258064517e-05, + "loss": 0.1788, + "step": 12132 + }, + { + "epoch": 0.194128, + "grad_norm": 1.203125, + "learning_rate": 8.123870967741935e-05, + "loss": 0.2022, + "step": 12133 + }, + { + "epoch": 0.194144, + "grad_norm": 1.234375, + "learning_rate": 8.123709677419355e-05, + "loss": 0.1839, + "step": 12134 + }, + { + "epoch": 0.19416, + "grad_norm": 1.0, + "learning_rate": 8.123548387096774e-05, + "loss": 0.1549, + "step": 12135 + }, + { + "epoch": 0.194176, + "grad_norm": 0.9296875, + "learning_rate": 8.123387096774194e-05, + "loss": 0.1506, + "step": 12136 + }, + { + "epoch": 0.194192, + "grad_norm": 0.79296875, + "learning_rate": 8.123225806451614e-05, + "loss": 0.1937, + "step": 12137 + }, + { + "epoch": 0.194208, + "grad_norm": 0.64453125, + "learning_rate": 8.123064516129032e-05, + "loss": 0.1785, + "step": 12138 + }, + { + "epoch": 0.194224, + "grad_norm": 0.62109375, + "learning_rate": 8.122903225806452e-05, + "loss": 0.1681, + "step": 12139 + }, + { + "epoch": 0.19424, + "grad_norm": 0.5859375, + "learning_rate": 8.122741935483872e-05, + "loss": 0.1451, + "step": 12140 + }, + { + "epoch": 0.194256, + "grad_norm": 1.1875, + "learning_rate": 8.122580645161291e-05, + "loss": 0.1667, + "step": 12141 + }, + { + "epoch": 0.194272, + "grad_norm": 1.1953125, + "learning_rate": 8.12241935483871e-05, + "loss": 0.1608, + "step": 12142 + }, + { + "epoch": 0.194288, + "grad_norm": 0.72265625, + "learning_rate": 8.12225806451613e-05, + "loss": 0.1764, + "step": 12143 + }, + { + "epoch": 0.194304, + "grad_norm": 0.6796875, + "learning_rate": 8.122096774193548e-05, + "loss": 0.177, + "step": 12144 + }, + { + "epoch": 0.19432, + "grad_norm": 0.8671875, + "learning_rate": 8.121935483870968e-05, + "loss": 0.1126, + "step": 12145 + }, + { + "epoch": 0.194336, + "grad_norm": 0.71875, + "learning_rate": 8.121774193548387e-05, + "loss": 0.2262, + "step": 12146 + }, + { + "epoch": 0.194352, + "grad_norm": 0.953125, + "learning_rate": 8.121612903225807e-05, + "loss": 0.1712, + "step": 12147 + }, + { + "epoch": 0.194368, + "grad_norm": 1.015625, + "learning_rate": 8.121451612903225e-05, + "loss": 0.2126, + "step": 12148 + }, + { + "epoch": 0.194384, + "grad_norm": 0.82421875, + "learning_rate": 8.121290322580645e-05, + "loss": 0.1356, + "step": 12149 + }, + { + "epoch": 0.1944, + "grad_norm": 0.84765625, + "learning_rate": 8.121129032258065e-05, + "loss": 0.2239, + "step": 12150 + }, + { + "epoch": 0.194416, + "grad_norm": 0.70703125, + "learning_rate": 8.120967741935485e-05, + "loss": 0.139, + "step": 12151 + }, + { + "epoch": 0.194432, + "grad_norm": 0.64453125, + "learning_rate": 8.120806451612904e-05, + "loss": 0.1564, + "step": 12152 + }, + { + "epoch": 0.194448, + "grad_norm": 1.09375, + "learning_rate": 8.120645161290324e-05, + "loss": 0.169, + "step": 12153 + }, + { + "epoch": 0.194464, + "grad_norm": 0.90234375, + "learning_rate": 8.120483870967742e-05, + "loss": 0.1695, + "step": 12154 + }, + { + "epoch": 0.19448, + "grad_norm": 1.0703125, + "learning_rate": 8.120322580645162e-05, + "loss": 0.1858, + "step": 12155 + }, + { + "epoch": 0.194496, + "grad_norm": 0.71875, + "learning_rate": 8.120161290322581e-05, + "loss": 0.1584, + "step": 12156 + }, + { + "epoch": 0.194512, + "grad_norm": 0.70703125, + "learning_rate": 8.120000000000001e-05, + "loss": 0.1821, + "step": 12157 + }, + { + "epoch": 0.194528, + "grad_norm": 0.87109375, + "learning_rate": 8.11983870967742e-05, + "loss": 0.1811, + "step": 12158 + }, + { + "epoch": 0.194544, + "grad_norm": 0.8515625, + "learning_rate": 8.119677419354838e-05, + "loss": 0.207, + "step": 12159 + }, + { + "epoch": 0.19456, + "grad_norm": 0.59375, + "learning_rate": 8.119516129032258e-05, + "loss": 0.1486, + "step": 12160 + }, + { + "epoch": 0.194576, + "grad_norm": 0.9453125, + "learning_rate": 8.119354838709678e-05, + "loss": 0.1467, + "step": 12161 + }, + { + "epoch": 0.194592, + "grad_norm": 0.8828125, + "learning_rate": 8.119193548387098e-05, + "loss": 0.1534, + "step": 12162 + }, + { + "epoch": 0.194608, + "grad_norm": 1.203125, + "learning_rate": 8.119032258064517e-05, + "loss": 0.1906, + "step": 12163 + }, + { + "epoch": 0.194624, + "grad_norm": 0.921875, + "learning_rate": 8.118870967741937e-05, + "loss": 0.2123, + "step": 12164 + }, + { + "epoch": 0.19464, + "grad_norm": 0.7265625, + "learning_rate": 8.118709677419355e-05, + "loss": 0.1516, + "step": 12165 + }, + { + "epoch": 0.194656, + "grad_norm": 0.8203125, + "learning_rate": 8.118548387096775e-05, + "loss": 0.1565, + "step": 12166 + }, + { + "epoch": 0.194672, + "grad_norm": 0.78515625, + "learning_rate": 8.118387096774194e-05, + "loss": 0.1937, + "step": 12167 + }, + { + "epoch": 0.194688, + "grad_norm": 0.6484375, + "learning_rate": 8.118225806451614e-05, + "loss": 0.1642, + "step": 12168 + }, + { + "epoch": 0.194704, + "grad_norm": 1.28125, + "learning_rate": 8.118064516129032e-05, + "loss": 0.1577, + "step": 12169 + }, + { + "epoch": 0.19472, + "grad_norm": 0.765625, + "learning_rate": 8.117903225806452e-05, + "loss": 0.1961, + "step": 12170 + }, + { + "epoch": 0.194736, + "grad_norm": 1.59375, + "learning_rate": 8.117741935483871e-05, + "loss": 0.2025, + "step": 12171 + }, + { + "epoch": 0.194752, + "grad_norm": 1.1171875, + "learning_rate": 8.117580645161291e-05, + "loss": 0.164, + "step": 12172 + }, + { + "epoch": 0.194768, + "grad_norm": 0.51953125, + "learning_rate": 8.11741935483871e-05, + "loss": 0.1416, + "step": 12173 + }, + { + "epoch": 0.194784, + "grad_norm": 0.98046875, + "learning_rate": 8.11725806451613e-05, + "loss": 0.1581, + "step": 12174 + }, + { + "epoch": 0.1948, + "grad_norm": 0.640625, + "learning_rate": 8.11709677419355e-05, + "loss": 0.1599, + "step": 12175 + }, + { + "epoch": 0.194816, + "grad_norm": 0.56640625, + "learning_rate": 8.116935483870968e-05, + "loss": 0.1409, + "step": 12176 + }, + { + "epoch": 0.194832, + "grad_norm": 1.046875, + "learning_rate": 8.116774193548388e-05, + "loss": 0.1616, + "step": 12177 + }, + { + "epoch": 0.194848, + "grad_norm": 0.68359375, + "learning_rate": 8.116612903225807e-05, + "loss": 0.1784, + "step": 12178 + }, + { + "epoch": 0.194864, + "grad_norm": 0.58203125, + "learning_rate": 8.116451612903226e-05, + "loss": 0.1564, + "step": 12179 + }, + { + "epoch": 0.19488, + "grad_norm": 0.84765625, + "learning_rate": 8.116290322580645e-05, + "loss": 0.1625, + "step": 12180 + }, + { + "epoch": 0.194896, + "grad_norm": 0.5703125, + "learning_rate": 8.116129032258065e-05, + "loss": 0.1424, + "step": 12181 + }, + { + "epoch": 0.194912, + "grad_norm": 1.28125, + "learning_rate": 8.115967741935484e-05, + "loss": 0.1892, + "step": 12182 + }, + { + "epoch": 0.194928, + "grad_norm": 0.51171875, + "learning_rate": 8.115806451612904e-05, + "loss": 0.1679, + "step": 12183 + }, + { + "epoch": 0.194944, + "grad_norm": 1.078125, + "learning_rate": 8.115645161290322e-05, + "loss": 0.187, + "step": 12184 + }, + { + "epoch": 0.19496, + "grad_norm": 0.69921875, + "learning_rate": 8.115483870967742e-05, + "loss": 0.1676, + "step": 12185 + }, + { + "epoch": 0.194976, + "grad_norm": 0.67578125, + "learning_rate": 8.115322580645162e-05, + "loss": 0.17, + "step": 12186 + }, + { + "epoch": 0.194992, + "grad_norm": 0.77734375, + "learning_rate": 8.115161290322582e-05, + "loss": 0.1781, + "step": 12187 + }, + { + "epoch": 0.195008, + "grad_norm": 0.765625, + "learning_rate": 8.115000000000001e-05, + "loss": 0.1647, + "step": 12188 + }, + { + "epoch": 0.195024, + "grad_norm": 0.62109375, + "learning_rate": 8.11483870967742e-05, + "loss": 0.1568, + "step": 12189 + }, + { + "epoch": 0.19504, + "grad_norm": 0.6328125, + "learning_rate": 8.114677419354839e-05, + "loss": 0.142, + "step": 12190 + }, + { + "epoch": 0.195056, + "grad_norm": 1.3359375, + "learning_rate": 8.114516129032258e-05, + "loss": 0.1246, + "step": 12191 + }, + { + "epoch": 0.195072, + "grad_norm": 0.66015625, + "learning_rate": 8.114354838709678e-05, + "loss": 0.1874, + "step": 12192 + }, + { + "epoch": 0.195088, + "grad_norm": 0.94921875, + "learning_rate": 8.114193548387096e-05, + "loss": 0.1348, + "step": 12193 + }, + { + "epoch": 0.195104, + "grad_norm": 0.62890625, + "learning_rate": 8.114032258064516e-05, + "loss": 0.1631, + "step": 12194 + }, + { + "epoch": 0.19512, + "grad_norm": 0.79296875, + "learning_rate": 8.113870967741935e-05, + "loss": 0.2125, + "step": 12195 + }, + { + "epoch": 0.195136, + "grad_norm": 0.578125, + "learning_rate": 8.113709677419355e-05, + "loss": 0.1679, + "step": 12196 + }, + { + "epoch": 0.195152, + "grad_norm": 0.9609375, + "learning_rate": 8.113548387096775e-05, + "loss": 0.1551, + "step": 12197 + }, + { + "epoch": 0.195168, + "grad_norm": 0.61328125, + "learning_rate": 8.113387096774195e-05, + "loss": 0.1657, + "step": 12198 + }, + { + "epoch": 0.195184, + "grad_norm": 0.5, + "learning_rate": 8.113225806451614e-05, + "loss": 0.1662, + "step": 12199 + }, + { + "epoch": 0.1952, + "grad_norm": 0.55078125, + "learning_rate": 8.113064516129034e-05, + "loss": 0.1484, + "step": 12200 + }, + { + "epoch": 0.195216, + "grad_norm": 0.71484375, + "learning_rate": 8.112903225806452e-05, + "loss": 0.1459, + "step": 12201 + }, + { + "epoch": 0.195232, + "grad_norm": 0.796875, + "learning_rate": 8.112741935483872e-05, + "loss": 0.216, + "step": 12202 + }, + { + "epoch": 0.195248, + "grad_norm": 0.98828125, + "learning_rate": 8.112580645161291e-05, + "loss": 0.2083, + "step": 12203 + }, + { + "epoch": 0.195264, + "grad_norm": 0.51171875, + "learning_rate": 8.112419354838709e-05, + "loss": 0.15, + "step": 12204 + }, + { + "epoch": 0.19528, + "grad_norm": 0.609375, + "learning_rate": 8.112258064516129e-05, + "loss": 0.1738, + "step": 12205 + }, + { + "epoch": 0.195296, + "grad_norm": 0.7265625, + "learning_rate": 8.112096774193548e-05, + "loss": 0.2009, + "step": 12206 + }, + { + "epoch": 0.195312, + "grad_norm": 0.546875, + "learning_rate": 8.111935483870968e-05, + "loss": 0.1495, + "step": 12207 + }, + { + "epoch": 0.195328, + "grad_norm": 0.8828125, + "learning_rate": 8.111774193548386e-05, + "loss": 0.1983, + "step": 12208 + }, + { + "epoch": 0.195344, + "grad_norm": 0.88671875, + "learning_rate": 8.111612903225806e-05, + "loss": 0.1825, + "step": 12209 + }, + { + "epoch": 0.19536, + "grad_norm": 0.6171875, + "learning_rate": 8.111451612903226e-05, + "loss": 0.1727, + "step": 12210 + }, + { + "epoch": 0.195376, + "grad_norm": 0.80078125, + "learning_rate": 8.111290322580646e-05, + "loss": 0.1722, + "step": 12211 + }, + { + "epoch": 0.195392, + "grad_norm": 0.63671875, + "learning_rate": 8.111129032258065e-05, + "loss": 0.1576, + "step": 12212 + }, + { + "epoch": 0.195408, + "grad_norm": 0.91796875, + "learning_rate": 8.110967741935485e-05, + "loss": 0.1755, + "step": 12213 + }, + { + "epoch": 0.195424, + "grad_norm": 0.875, + "learning_rate": 8.110806451612904e-05, + "loss": 0.1567, + "step": 12214 + }, + { + "epoch": 0.19544, + "grad_norm": 0.60546875, + "learning_rate": 8.110645161290323e-05, + "loss": 0.1743, + "step": 12215 + }, + { + "epoch": 0.195456, + "grad_norm": 1.0546875, + "learning_rate": 8.110483870967742e-05, + "loss": 0.1785, + "step": 12216 + }, + { + "epoch": 0.195472, + "grad_norm": 1.0859375, + "learning_rate": 8.110322580645162e-05, + "loss": 0.1545, + "step": 12217 + }, + { + "epoch": 0.195488, + "grad_norm": 0.91015625, + "learning_rate": 8.11016129032258e-05, + "loss": 0.1809, + "step": 12218 + }, + { + "epoch": 0.195504, + "grad_norm": 0.78125, + "learning_rate": 8.11e-05, + "loss": 0.1792, + "step": 12219 + }, + { + "epoch": 0.19552, + "grad_norm": 1.09375, + "learning_rate": 8.109838709677419e-05, + "loss": 0.1687, + "step": 12220 + }, + { + "epoch": 0.195536, + "grad_norm": 0.63671875, + "learning_rate": 8.109677419354839e-05, + "loss": 0.1857, + "step": 12221 + }, + { + "epoch": 0.195552, + "grad_norm": 0.6953125, + "learning_rate": 8.109516129032259e-05, + "loss": 0.1989, + "step": 12222 + }, + { + "epoch": 0.195568, + "grad_norm": 1.1171875, + "learning_rate": 8.109354838709678e-05, + "loss": 0.1726, + "step": 12223 + }, + { + "epoch": 0.195584, + "grad_norm": 0.80859375, + "learning_rate": 8.109193548387098e-05, + "loss": 0.154, + "step": 12224 + }, + { + "epoch": 0.1956, + "grad_norm": 0.78515625, + "learning_rate": 8.109032258064516e-05, + "loss": 0.1533, + "step": 12225 + }, + { + "epoch": 0.195616, + "grad_norm": 0.66796875, + "learning_rate": 8.108870967741936e-05, + "loss": 0.2213, + "step": 12226 + }, + { + "epoch": 0.195632, + "grad_norm": 0.671875, + "learning_rate": 8.108709677419355e-05, + "loss": 0.1181, + "step": 12227 + }, + { + "epoch": 0.195648, + "grad_norm": 1.09375, + "learning_rate": 8.108548387096775e-05, + "loss": 0.1945, + "step": 12228 + }, + { + "epoch": 0.195664, + "grad_norm": 0.88671875, + "learning_rate": 8.108387096774193e-05, + "loss": 0.1679, + "step": 12229 + }, + { + "epoch": 0.19568, + "grad_norm": 0.80859375, + "learning_rate": 8.108225806451613e-05, + "loss": 0.1737, + "step": 12230 + }, + { + "epoch": 0.195696, + "grad_norm": 0.7265625, + "learning_rate": 8.108064516129032e-05, + "loss": 0.2112, + "step": 12231 + }, + { + "epoch": 0.195712, + "grad_norm": 0.62890625, + "learning_rate": 8.107903225806452e-05, + "loss": 0.1768, + "step": 12232 + }, + { + "epoch": 0.195728, + "grad_norm": 0.83203125, + "learning_rate": 8.10774193548387e-05, + "loss": 0.1771, + "step": 12233 + }, + { + "epoch": 0.195744, + "grad_norm": 0.5078125, + "learning_rate": 8.10758064516129e-05, + "loss": 0.1596, + "step": 12234 + }, + { + "epoch": 0.19576, + "grad_norm": 0.61328125, + "learning_rate": 8.10741935483871e-05, + "loss": 0.1561, + "step": 12235 + }, + { + "epoch": 0.195776, + "grad_norm": 0.67578125, + "learning_rate": 8.107258064516129e-05, + "loss": 0.1539, + "step": 12236 + }, + { + "epoch": 0.195792, + "grad_norm": 0.79296875, + "learning_rate": 8.107096774193549e-05, + "loss": 0.1507, + "step": 12237 + }, + { + "epoch": 0.195808, + "grad_norm": 0.59375, + "learning_rate": 8.106935483870968e-05, + "loss": 0.1699, + "step": 12238 + }, + { + "epoch": 0.195824, + "grad_norm": 0.8984375, + "learning_rate": 8.106774193548388e-05, + "loss": 0.1706, + "step": 12239 + }, + { + "epoch": 0.19584, + "grad_norm": 0.6015625, + "learning_rate": 8.106612903225806e-05, + "loss": 0.1689, + "step": 12240 + }, + { + "epoch": 0.195856, + "grad_norm": 0.5703125, + "learning_rate": 8.106451612903226e-05, + "loss": 0.1462, + "step": 12241 + }, + { + "epoch": 0.195872, + "grad_norm": 0.7578125, + "learning_rate": 8.106290322580645e-05, + "loss": 0.1845, + "step": 12242 + }, + { + "epoch": 0.195888, + "grad_norm": 0.69140625, + "learning_rate": 8.106129032258065e-05, + "loss": 0.1715, + "step": 12243 + }, + { + "epoch": 0.195904, + "grad_norm": 0.73828125, + "learning_rate": 8.105967741935483e-05, + "loss": 0.1401, + "step": 12244 + }, + { + "epoch": 0.19592, + "grad_norm": 0.5625, + "learning_rate": 8.105806451612903e-05, + "loss": 0.1873, + "step": 12245 + }, + { + "epoch": 0.195936, + "grad_norm": 0.67578125, + "learning_rate": 8.105645161290323e-05, + "loss": 0.1485, + "step": 12246 + }, + { + "epoch": 0.195952, + "grad_norm": 1.1015625, + "learning_rate": 8.105483870967743e-05, + "loss": 0.2135, + "step": 12247 + }, + { + "epoch": 0.195968, + "grad_norm": 0.6875, + "learning_rate": 8.105322580645162e-05, + "loss": 0.1548, + "step": 12248 + }, + { + "epoch": 0.195984, + "grad_norm": 0.6015625, + "learning_rate": 8.105161290322582e-05, + "loss": 0.2036, + "step": 12249 + }, + { + "epoch": 0.196, + "grad_norm": 0.74609375, + "learning_rate": 8.105e-05, + "loss": 0.1566, + "step": 12250 + }, + { + "epoch": 0.196016, + "grad_norm": 1.046875, + "learning_rate": 8.104838709677419e-05, + "loss": 0.1512, + "step": 12251 + }, + { + "epoch": 0.196032, + "grad_norm": 0.87109375, + "learning_rate": 8.104677419354839e-05, + "loss": 0.1925, + "step": 12252 + }, + { + "epoch": 0.196048, + "grad_norm": 0.87109375, + "learning_rate": 8.104516129032258e-05, + "loss": 0.1524, + "step": 12253 + }, + { + "epoch": 0.196064, + "grad_norm": 0.8828125, + "learning_rate": 8.104354838709678e-05, + "loss": 0.1853, + "step": 12254 + }, + { + "epoch": 0.19608, + "grad_norm": 0.8203125, + "learning_rate": 8.104193548387096e-05, + "loss": 0.175, + "step": 12255 + }, + { + "epoch": 0.196096, + "grad_norm": 0.71484375, + "learning_rate": 8.104032258064516e-05, + "loss": 0.166, + "step": 12256 + }, + { + "epoch": 0.196112, + "grad_norm": 0.921875, + "learning_rate": 8.103870967741936e-05, + "loss": 0.12, + "step": 12257 + }, + { + "epoch": 0.196128, + "grad_norm": 0.953125, + "learning_rate": 8.103709677419356e-05, + "loss": 0.2063, + "step": 12258 + }, + { + "epoch": 0.196144, + "grad_norm": 0.8046875, + "learning_rate": 8.103548387096775e-05, + "loss": 0.1731, + "step": 12259 + }, + { + "epoch": 0.19616, + "grad_norm": 1.0, + "learning_rate": 8.103387096774195e-05, + "loss": 0.1774, + "step": 12260 + }, + { + "epoch": 0.196176, + "grad_norm": 1.03125, + "learning_rate": 8.103225806451613e-05, + "loss": 0.1717, + "step": 12261 + }, + { + "epoch": 0.196192, + "grad_norm": 1.0625, + "learning_rate": 8.103064516129033e-05, + "loss": 0.1558, + "step": 12262 + }, + { + "epoch": 0.196208, + "grad_norm": 1.046875, + "learning_rate": 8.102903225806452e-05, + "loss": 0.2342, + "step": 12263 + }, + { + "epoch": 0.196224, + "grad_norm": 0.60546875, + "learning_rate": 8.102741935483872e-05, + "loss": 0.1625, + "step": 12264 + }, + { + "epoch": 0.19624, + "grad_norm": 0.6953125, + "learning_rate": 8.10258064516129e-05, + "loss": 0.1362, + "step": 12265 + }, + { + "epoch": 0.196256, + "grad_norm": 0.80859375, + "learning_rate": 8.10241935483871e-05, + "loss": 0.1672, + "step": 12266 + }, + { + "epoch": 0.196272, + "grad_norm": 1.421875, + "learning_rate": 8.102258064516129e-05, + "loss": 0.1936, + "step": 12267 + }, + { + "epoch": 0.196288, + "grad_norm": 0.83984375, + "learning_rate": 8.102096774193548e-05, + "loss": 0.223, + "step": 12268 + }, + { + "epoch": 0.196304, + "grad_norm": 0.66796875, + "learning_rate": 8.101935483870968e-05, + "loss": 0.1559, + "step": 12269 + }, + { + "epoch": 0.19632, + "grad_norm": 0.69140625, + "learning_rate": 8.101774193548388e-05, + "loss": 0.136, + "step": 12270 + }, + { + "epoch": 0.196336, + "grad_norm": 0.8515625, + "learning_rate": 8.101612903225808e-05, + "loss": 0.2124, + "step": 12271 + }, + { + "epoch": 0.196352, + "grad_norm": 0.7578125, + "learning_rate": 8.101451612903226e-05, + "loss": 0.173, + "step": 12272 + }, + { + "epoch": 0.196368, + "grad_norm": 0.640625, + "learning_rate": 8.101290322580646e-05, + "loss": 0.1647, + "step": 12273 + }, + { + "epoch": 0.196384, + "grad_norm": 1.1953125, + "learning_rate": 8.101129032258065e-05, + "loss": 0.1915, + "step": 12274 + }, + { + "epoch": 0.1964, + "grad_norm": 1.2578125, + "learning_rate": 8.100967741935485e-05, + "loss": 0.173, + "step": 12275 + }, + { + "epoch": 0.196416, + "grad_norm": 1.15625, + "learning_rate": 8.100806451612903e-05, + "loss": 0.1724, + "step": 12276 + }, + { + "epoch": 0.196432, + "grad_norm": 0.62109375, + "learning_rate": 8.100645161290323e-05, + "loss": 0.146, + "step": 12277 + }, + { + "epoch": 0.196448, + "grad_norm": 0.69921875, + "learning_rate": 8.100483870967742e-05, + "loss": 0.1528, + "step": 12278 + }, + { + "epoch": 0.196464, + "grad_norm": 0.79296875, + "learning_rate": 8.100322580645162e-05, + "loss": 0.1611, + "step": 12279 + }, + { + "epoch": 0.19648, + "grad_norm": 0.9453125, + "learning_rate": 8.10016129032258e-05, + "loss": 0.1702, + "step": 12280 + }, + { + "epoch": 0.196496, + "grad_norm": 0.88671875, + "learning_rate": 8.1e-05, + "loss": 0.1682, + "step": 12281 + }, + { + "epoch": 0.196512, + "grad_norm": 0.65625, + "learning_rate": 8.09983870967742e-05, + "loss": 0.1471, + "step": 12282 + }, + { + "epoch": 0.196528, + "grad_norm": 1.0234375, + "learning_rate": 8.099677419354839e-05, + "loss": 0.1648, + "step": 12283 + }, + { + "epoch": 0.196544, + "grad_norm": 0.70703125, + "learning_rate": 8.099516129032259e-05, + "loss": 0.1633, + "step": 12284 + }, + { + "epoch": 0.19656, + "grad_norm": 0.6484375, + "learning_rate": 8.099354838709678e-05, + "loss": 0.1761, + "step": 12285 + }, + { + "epoch": 0.196576, + "grad_norm": 0.60546875, + "learning_rate": 8.099193548387097e-05, + "loss": 0.169, + "step": 12286 + }, + { + "epoch": 0.196592, + "grad_norm": 1.109375, + "learning_rate": 8.099032258064516e-05, + "loss": 0.1368, + "step": 12287 + }, + { + "epoch": 0.196608, + "grad_norm": 0.77734375, + "learning_rate": 8.098870967741936e-05, + "loss": 0.129, + "step": 12288 + }, + { + "epoch": 0.196624, + "grad_norm": 1.0, + "learning_rate": 8.098709677419355e-05, + "loss": 0.1496, + "step": 12289 + }, + { + "epoch": 0.19664, + "grad_norm": 0.703125, + "learning_rate": 8.098548387096775e-05, + "loss": 0.1909, + "step": 12290 + }, + { + "epoch": 0.196656, + "grad_norm": 0.76171875, + "learning_rate": 8.098387096774193e-05, + "loss": 0.1816, + "step": 12291 + }, + { + "epoch": 0.196672, + "grad_norm": 0.546875, + "learning_rate": 8.098225806451613e-05, + "loss": 0.1408, + "step": 12292 + }, + { + "epoch": 0.196688, + "grad_norm": 1.09375, + "learning_rate": 8.098064516129033e-05, + "loss": 0.1941, + "step": 12293 + }, + { + "epoch": 0.196704, + "grad_norm": 0.80859375, + "learning_rate": 8.097903225806453e-05, + "loss": 0.1764, + "step": 12294 + }, + { + "epoch": 0.19672, + "grad_norm": 1.15625, + "learning_rate": 8.097741935483872e-05, + "loss": 0.2216, + "step": 12295 + }, + { + "epoch": 0.196736, + "grad_norm": 1.1015625, + "learning_rate": 8.097580645161292e-05, + "loss": 0.1797, + "step": 12296 + }, + { + "epoch": 0.196752, + "grad_norm": 0.828125, + "learning_rate": 8.09741935483871e-05, + "loss": 0.1891, + "step": 12297 + }, + { + "epoch": 0.196768, + "grad_norm": 0.7109375, + "learning_rate": 8.097258064516129e-05, + "loss": 0.1641, + "step": 12298 + }, + { + "epoch": 0.196784, + "grad_norm": 0.75, + "learning_rate": 8.097096774193549e-05, + "loss": 0.1693, + "step": 12299 + }, + { + "epoch": 0.1968, + "grad_norm": 0.6640625, + "learning_rate": 8.096935483870967e-05, + "loss": 0.1522, + "step": 12300 + }, + { + "epoch": 0.196816, + "grad_norm": 0.671875, + "learning_rate": 8.096774193548387e-05, + "loss": 0.1568, + "step": 12301 + }, + { + "epoch": 0.196832, + "grad_norm": 0.72265625, + "learning_rate": 8.096612903225806e-05, + "loss": 0.1791, + "step": 12302 + }, + { + "epoch": 0.196848, + "grad_norm": 0.5703125, + "learning_rate": 8.096451612903226e-05, + "loss": 0.165, + "step": 12303 + }, + { + "epoch": 0.196864, + "grad_norm": 0.98828125, + "learning_rate": 8.096290322580645e-05, + "loss": 0.1916, + "step": 12304 + }, + { + "epoch": 0.19688, + "grad_norm": 0.796875, + "learning_rate": 8.096129032258065e-05, + "loss": 0.1238, + "step": 12305 + }, + { + "epoch": 0.196896, + "grad_norm": 0.56640625, + "learning_rate": 8.095967741935485e-05, + "loss": 0.1758, + "step": 12306 + }, + { + "epoch": 0.196912, + "grad_norm": 0.59765625, + "learning_rate": 8.095806451612905e-05, + "loss": 0.1174, + "step": 12307 + }, + { + "epoch": 0.196928, + "grad_norm": 0.62109375, + "learning_rate": 8.095645161290323e-05, + "loss": 0.1613, + "step": 12308 + }, + { + "epoch": 0.196944, + "grad_norm": 0.79296875, + "learning_rate": 8.095483870967743e-05, + "loss": 0.1662, + "step": 12309 + }, + { + "epoch": 0.19696, + "grad_norm": 0.953125, + "learning_rate": 8.095322580645162e-05, + "loss": 0.2133, + "step": 12310 + }, + { + "epoch": 0.196976, + "grad_norm": 0.6015625, + "learning_rate": 8.095161290322582e-05, + "loss": 0.1541, + "step": 12311 + }, + { + "epoch": 0.196992, + "grad_norm": 0.6640625, + "learning_rate": 8.095e-05, + "loss": 0.187, + "step": 12312 + }, + { + "epoch": 0.197008, + "grad_norm": 0.9140625, + "learning_rate": 8.094838709677419e-05, + "loss": 0.2014, + "step": 12313 + }, + { + "epoch": 0.197024, + "grad_norm": 0.58984375, + "learning_rate": 8.094677419354839e-05, + "loss": 0.1628, + "step": 12314 + }, + { + "epoch": 0.19704, + "grad_norm": 0.6484375, + "learning_rate": 8.094516129032257e-05, + "loss": 0.15, + "step": 12315 + }, + { + "epoch": 0.197056, + "grad_norm": 0.83984375, + "learning_rate": 8.094354838709677e-05, + "loss": 0.1846, + "step": 12316 + }, + { + "epoch": 0.197072, + "grad_norm": 0.7109375, + "learning_rate": 8.094193548387097e-05, + "loss": 0.1995, + "step": 12317 + }, + { + "epoch": 0.197088, + "grad_norm": 1.484375, + "learning_rate": 8.094032258064517e-05, + "loss": 0.1915, + "step": 12318 + }, + { + "epoch": 0.197104, + "grad_norm": 0.8125, + "learning_rate": 8.093870967741936e-05, + "loss": 0.1813, + "step": 12319 + }, + { + "epoch": 0.19712, + "grad_norm": 0.75, + "learning_rate": 8.093709677419356e-05, + "loss": 0.1849, + "step": 12320 + }, + { + "epoch": 0.197136, + "grad_norm": 1.0625, + "learning_rate": 8.093548387096774e-05, + "loss": 0.2195, + "step": 12321 + }, + { + "epoch": 0.197152, + "grad_norm": 0.83984375, + "learning_rate": 8.093387096774194e-05, + "loss": 0.1771, + "step": 12322 + }, + { + "epoch": 0.197168, + "grad_norm": 1.0859375, + "learning_rate": 8.093225806451613e-05, + "loss": 0.1866, + "step": 12323 + }, + { + "epoch": 0.197184, + "grad_norm": 0.7890625, + "learning_rate": 8.093064516129033e-05, + "loss": 0.1454, + "step": 12324 + }, + { + "epoch": 0.1972, + "grad_norm": 0.5234375, + "learning_rate": 8.092903225806452e-05, + "loss": 0.1327, + "step": 12325 + }, + { + "epoch": 0.197216, + "grad_norm": 0.90625, + "learning_rate": 8.092741935483872e-05, + "loss": 0.1905, + "step": 12326 + }, + { + "epoch": 0.197232, + "grad_norm": 0.9609375, + "learning_rate": 8.09258064516129e-05, + "loss": 0.2279, + "step": 12327 + }, + { + "epoch": 0.197248, + "grad_norm": 0.7421875, + "learning_rate": 8.09241935483871e-05, + "loss": 0.1585, + "step": 12328 + }, + { + "epoch": 0.197264, + "grad_norm": 1.0625, + "learning_rate": 8.092258064516129e-05, + "loss": 0.1601, + "step": 12329 + }, + { + "epoch": 0.19728, + "grad_norm": 0.80078125, + "learning_rate": 8.092096774193549e-05, + "loss": 0.171, + "step": 12330 + }, + { + "epoch": 0.197296, + "grad_norm": 0.578125, + "learning_rate": 8.091935483870969e-05, + "loss": 0.1619, + "step": 12331 + }, + { + "epoch": 0.197312, + "grad_norm": 0.625, + "learning_rate": 8.091774193548387e-05, + "loss": 0.1421, + "step": 12332 + }, + { + "epoch": 0.197328, + "grad_norm": 0.7890625, + "learning_rate": 8.091612903225807e-05, + "loss": 0.1795, + "step": 12333 + }, + { + "epoch": 0.197344, + "grad_norm": 0.96484375, + "learning_rate": 8.091451612903226e-05, + "loss": 0.1801, + "step": 12334 + }, + { + "epoch": 0.19736, + "grad_norm": 0.703125, + "learning_rate": 8.091290322580646e-05, + "loss": 0.1949, + "step": 12335 + }, + { + "epoch": 0.197376, + "grad_norm": 0.95703125, + "learning_rate": 8.091129032258064e-05, + "loss": 0.2004, + "step": 12336 + }, + { + "epoch": 0.197392, + "grad_norm": 1.3203125, + "learning_rate": 8.090967741935484e-05, + "loss": 0.2072, + "step": 12337 + }, + { + "epoch": 0.197408, + "grad_norm": 0.71484375, + "learning_rate": 8.090806451612903e-05, + "loss": 0.1937, + "step": 12338 + }, + { + "epoch": 0.197424, + "grad_norm": 1.015625, + "learning_rate": 8.090645161290323e-05, + "loss": 0.2079, + "step": 12339 + }, + { + "epoch": 0.19744, + "grad_norm": 1.1640625, + "learning_rate": 8.090483870967742e-05, + "loss": 0.1571, + "step": 12340 + }, + { + "epoch": 0.197456, + "grad_norm": 0.86328125, + "learning_rate": 8.090322580645162e-05, + "loss": 0.1603, + "step": 12341 + }, + { + "epoch": 0.197472, + "grad_norm": 0.69140625, + "learning_rate": 8.090161290322582e-05, + "loss": 0.1849, + "step": 12342 + }, + { + "epoch": 0.197488, + "grad_norm": 0.7109375, + "learning_rate": 8.090000000000001e-05, + "loss": 0.1649, + "step": 12343 + }, + { + "epoch": 0.197504, + "grad_norm": 0.7109375, + "learning_rate": 8.08983870967742e-05, + "loss": 0.1674, + "step": 12344 + }, + { + "epoch": 0.19752, + "grad_norm": 1.234375, + "learning_rate": 8.089677419354839e-05, + "loss": 0.1918, + "step": 12345 + }, + { + "epoch": 0.197536, + "grad_norm": 0.703125, + "learning_rate": 8.089516129032259e-05, + "loss": 0.1966, + "step": 12346 + }, + { + "epoch": 0.197552, + "grad_norm": 0.89453125, + "learning_rate": 8.089354838709677e-05, + "loss": 0.1713, + "step": 12347 + }, + { + "epoch": 0.197568, + "grad_norm": 0.76953125, + "learning_rate": 8.089193548387097e-05, + "loss": 0.1699, + "step": 12348 + }, + { + "epoch": 0.197584, + "grad_norm": 0.474609375, + "learning_rate": 8.089032258064516e-05, + "loss": 0.1211, + "step": 12349 + }, + { + "epoch": 0.1976, + "grad_norm": 1.15625, + "learning_rate": 8.088870967741936e-05, + "loss": 0.2076, + "step": 12350 + }, + { + "epoch": 0.197616, + "grad_norm": 0.640625, + "learning_rate": 8.088709677419354e-05, + "loss": 0.1332, + "step": 12351 + }, + { + "epoch": 0.197632, + "grad_norm": 0.6015625, + "learning_rate": 8.088548387096774e-05, + "loss": 0.1346, + "step": 12352 + }, + { + "epoch": 0.197648, + "grad_norm": 1.125, + "learning_rate": 8.088387096774194e-05, + "loss": 0.2035, + "step": 12353 + }, + { + "epoch": 0.197664, + "grad_norm": 0.5859375, + "learning_rate": 8.088225806451614e-05, + "loss": 0.1503, + "step": 12354 + }, + { + "epoch": 0.19768, + "grad_norm": 0.88671875, + "learning_rate": 8.088064516129033e-05, + "loss": 0.1803, + "step": 12355 + }, + { + "epoch": 0.197696, + "grad_norm": 0.87109375, + "learning_rate": 8.087903225806453e-05, + "loss": 0.1876, + "step": 12356 + }, + { + "epoch": 0.197712, + "grad_norm": 0.765625, + "learning_rate": 8.087741935483871e-05, + "loss": 0.1696, + "step": 12357 + }, + { + "epoch": 0.197728, + "grad_norm": 0.67578125, + "learning_rate": 8.087580645161291e-05, + "loss": 0.1784, + "step": 12358 + }, + { + "epoch": 0.197744, + "grad_norm": 0.60546875, + "learning_rate": 8.08741935483871e-05, + "loss": 0.1688, + "step": 12359 + }, + { + "epoch": 0.19776, + "grad_norm": 0.66015625, + "learning_rate": 8.087258064516129e-05, + "loss": 0.1506, + "step": 12360 + }, + { + "epoch": 0.197776, + "grad_norm": 0.6953125, + "learning_rate": 8.087096774193549e-05, + "loss": 0.1558, + "step": 12361 + }, + { + "epoch": 0.197792, + "grad_norm": 1.0234375, + "learning_rate": 8.086935483870967e-05, + "loss": 0.185, + "step": 12362 + }, + { + "epoch": 0.197808, + "grad_norm": 1.2421875, + "learning_rate": 8.086774193548387e-05, + "loss": 0.1382, + "step": 12363 + }, + { + "epoch": 0.197824, + "grad_norm": 0.95703125, + "learning_rate": 8.086612903225806e-05, + "loss": 0.2008, + "step": 12364 + }, + { + "epoch": 0.19784, + "grad_norm": 1.0703125, + "learning_rate": 8.086451612903226e-05, + "loss": 0.2191, + "step": 12365 + }, + { + "epoch": 0.197856, + "grad_norm": 1.0234375, + "learning_rate": 8.086290322580646e-05, + "loss": 0.1775, + "step": 12366 + }, + { + "epoch": 0.197872, + "grad_norm": 0.96875, + "learning_rate": 8.086129032258066e-05, + "loss": 0.1953, + "step": 12367 + }, + { + "epoch": 0.197888, + "grad_norm": 0.87890625, + "learning_rate": 8.085967741935484e-05, + "loss": 0.1836, + "step": 12368 + }, + { + "epoch": 0.197904, + "grad_norm": 0.828125, + "learning_rate": 8.085806451612904e-05, + "loss": 0.151, + "step": 12369 + }, + { + "epoch": 0.19792, + "grad_norm": 0.8359375, + "learning_rate": 8.085645161290323e-05, + "loss": 0.1506, + "step": 12370 + }, + { + "epoch": 0.197936, + "grad_norm": 0.8984375, + "learning_rate": 8.085483870967743e-05, + "loss": 0.1828, + "step": 12371 + }, + { + "epoch": 0.197952, + "grad_norm": 0.796875, + "learning_rate": 8.085322580645161e-05, + "loss": 0.1523, + "step": 12372 + }, + { + "epoch": 0.197968, + "grad_norm": 0.81640625, + "learning_rate": 8.085161290322581e-05, + "loss": 0.1667, + "step": 12373 + }, + { + "epoch": 0.197984, + "grad_norm": 0.6484375, + "learning_rate": 8.085e-05, + "loss": 0.1852, + "step": 12374 + }, + { + "epoch": 0.198, + "grad_norm": 0.8984375, + "learning_rate": 8.084838709677419e-05, + "loss": 0.2022, + "step": 12375 + }, + { + "epoch": 0.198016, + "grad_norm": 0.75, + "learning_rate": 8.084677419354839e-05, + "loss": 0.1514, + "step": 12376 + }, + { + "epoch": 0.198032, + "grad_norm": 0.6484375, + "learning_rate": 8.084516129032259e-05, + "loss": 0.1671, + "step": 12377 + }, + { + "epoch": 0.198048, + "grad_norm": 1.4453125, + "learning_rate": 8.084354838709679e-05, + "loss": 0.1934, + "step": 12378 + }, + { + "epoch": 0.198064, + "grad_norm": 0.58203125, + "learning_rate": 8.084193548387097e-05, + "loss": 0.1271, + "step": 12379 + }, + { + "epoch": 0.19808, + "grad_norm": 0.55078125, + "learning_rate": 8.084032258064517e-05, + "loss": 0.1527, + "step": 12380 + }, + { + "epoch": 0.198096, + "grad_norm": 1.0234375, + "learning_rate": 8.083870967741936e-05, + "loss": 0.2002, + "step": 12381 + }, + { + "epoch": 0.198112, + "grad_norm": 0.6875, + "learning_rate": 8.083709677419356e-05, + "loss": 0.1702, + "step": 12382 + }, + { + "epoch": 0.198128, + "grad_norm": 0.63671875, + "learning_rate": 8.083548387096774e-05, + "loss": 0.1585, + "step": 12383 + }, + { + "epoch": 0.198144, + "grad_norm": 0.67578125, + "learning_rate": 8.083387096774194e-05, + "loss": 0.178, + "step": 12384 + }, + { + "epoch": 0.19816, + "grad_norm": 0.8984375, + "learning_rate": 8.083225806451613e-05, + "loss": 0.215, + "step": 12385 + }, + { + "epoch": 0.198176, + "grad_norm": 0.9453125, + "learning_rate": 8.083064516129033e-05, + "loss": 0.1504, + "step": 12386 + }, + { + "epoch": 0.198192, + "grad_norm": 0.87890625, + "learning_rate": 8.082903225806451e-05, + "loss": 0.2135, + "step": 12387 + }, + { + "epoch": 0.198208, + "grad_norm": 0.5859375, + "learning_rate": 8.082741935483871e-05, + "loss": 0.1529, + "step": 12388 + }, + { + "epoch": 0.198224, + "grad_norm": 0.921875, + "learning_rate": 8.082580645161291e-05, + "loss": 0.1814, + "step": 12389 + }, + { + "epoch": 0.19824, + "grad_norm": 0.83984375, + "learning_rate": 8.08241935483871e-05, + "loss": 0.2156, + "step": 12390 + }, + { + "epoch": 0.198256, + "grad_norm": 0.58203125, + "learning_rate": 8.08225806451613e-05, + "loss": 0.1943, + "step": 12391 + }, + { + "epoch": 0.198272, + "grad_norm": 1.21875, + "learning_rate": 8.082096774193549e-05, + "loss": 0.163, + "step": 12392 + }, + { + "epoch": 0.198288, + "grad_norm": 0.83984375, + "learning_rate": 8.081935483870968e-05, + "loss": 0.1553, + "step": 12393 + }, + { + "epoch": 0.198304, + "grad_norm": 0.640625, + "learning_rate": 8.081774193548387e-05, + "loss": 0.1401, + "step": 12394 + }, + { + "epoch": 0.19832, + "grad_norm": 1.078125, + "learning_rate": 8.081612903225807e-05, + "loss": 0.1682, + "step": 12395 + }, + { + "epoch": 0.198336, + "grad_norm": 0.7578125, + "learning_rate": 8.081451612903226e-05, + "loss": 0.2463, + "step": 12396 + }, + { + "epoch": 0.198352, + "grad_norm": 0.89453125, + "learning_rate": 8.081290322580646e-05, + "loss": 0.1642, + "step": 12397 + }, + { + "epoch": 0.198368, + "grad_norm": 1.140625, + "learning_rate": 8.081129032258064e-05, + "loss": 0.2097, + "step": 12398 + }, + { + "epoch": 0.198384, + "grad_norm": 0.98046875, + "learning_rate": 8.080967741935484e-05, + "loss": 0.1985, + "step": 12399 + }, + { + "epoch": 0.1984, + "grad_norm": 1.453125, + "learning_rate": 8.080806451612903e-05, + "loss": 0.16, + "step": 12400 + }, + { + "epoch": 0.198416, + "grad_norm": 0.65625, + "learning_rate": 8.080645161290323e-05, + "loss": 0.1666, + "step": 12401 + }, + { + "epoch": 0.198432, + "grad_norm": 0.546875, + "learning_rate": 8.080483870967743e-05, + "loss": 0.1558, + "step": 12402 + }, + { + "epoch": 0.198448, + "grad_norm": 0.67578125, + "learning_rate": 8.080322580645163e-05, + "loss": 0.1691, + "step": 12403 + }, + { + "epoch": 0.198464, + "grad_norm": 0.85546875, + "learning_rate": 8.080161290322581e-05, + "loss": 0.2335, + "step": 12404 + }, + { + "epoch": 0.19848, + "grad_norm": 0.78515625, + "learning_rate": 8.080000000000001e-05, + "loss": 0.1657, + "step": 12405 + }, + { + "epoch": 0.198496, + "grad_norm": 0.75, + "learning_rate": 8.07983870967742e-05, + "loss": 0.153, + "step": 12406 + }, + { + "epoch": 0.198512, + "grad_norm": 0.72265625, + "learning_rate": 8.079677419354838e-05, + "loss": 0.1551, + "step": 12407 + }, + { + "epoch": 0.198528, + "grad_norm": 0.88671875, + "learning_rate": 8.079516129032258e-05, + "loss": 0.1983, + "step": 12408 + }, + { + "epoch": 0.198544, + "grad_norm": 0.6171875, + "learning_rate": 8.079354838709677e-05, + "loss": 0.1726, + "step": 12409 + }, + { + "epoch": 0.19856, + "grad_norm": 0.79296875, + "learning_rate": 8.079193548387097e-05, + "loss": 0.1792, + "step": 12410 + }, + { + "epoch": 0.198576, + "grad_norm": 0.67578125, + "learning_rate": 8.079032258064516e-05, + "loss": 0.162, + "step": 12411 + }, + { + "epoch": 0.198592, + "grad_norm": 0.80859375, + "learning_rate": 8.078870967741936e-05, + "loss": 0.1665, + "step": 12412 + }, + { + "epoch": 0.198608, + "grad_norm": 0.67578125, + "learning_rate": 8.078709677419356e-05, + "loss": 0.174, + "step": 12413 + }, + { + "epoch": 0.198624, + "grad_norm": 0.83203125, + "learning_rate": 8.078548387096775e-05, + "loss": 0.1937, + "step": 12414 + }, + { + "epoch": 0.19864, + "grad_norm": 0.94140625, + "learning_rate": 8.078387096774194e-05, + "loss": 0.118, + "step": 12415 + }, + { + "epoch": 0.198656, + "grad_norm": 0.79296875, + "learning_rate": 8.078225806451614e-05, + "loss": 0.1543, + "step": 12416 + }, + { + "epoch": 0.198672, + "grad_norm": 0.5234375, + "learning_rate": 8.078064516129033e-05, + "loss": 0.1772, + "step": 12417 + }, + { + "epoch": 0.198688, + "grad_norm": 0.8125, + "learning_rate": 8.077903225806453e-05, + "loss": 0.1878, + "step": 12418 + }, + { + "epoch": 0.198704, + "grad_norm": 1.3984375, + "learning_rate": 8.077741935483871e-05, + "loss": 0.1968, + "step": 12419 + }, + { + "epoch": 0.19872, + "grad_norm": 1.78125, + "learning_rate": 8.077580645161291e-05, + "loss": 0.2392, + "step": 12420 + }, + { + "epoch": 0.198736, + "grad_norm": 0.7109375, + "learning_rate": 8.07741935483871e-05, + "loss": 0.1731, + "step": 12421 + }, + { + "epoch": 0.198752, + "grad_norm": 0.63671875, + "learning_rate": 8.077258064516128e-05, + "loss": 0.1482, + "step": 12422 + }, + { + "epoch": 0.198768, + "grad_norm": 1.0625, + "learning_rate": 8.077096774193548e-05, + "loss": 0.1842, + "step": 12423 + }, + { + "epoch": 0.198784, + "grad_norm": 0.5625, + "learning_rate": 8.076935483870967e-05, + "loss": 0.1393, + "step": 12424 + }, + { + "epoch": 0.1988, + "grad_norm": 0.55859375, + "learning_rate": 8.076774193548387e-05, + "loss": 0.1427, + "step": 12425 + }, + { + "epoch": 0.198816, + "grad_norm": 1.5, + "learning_rate": 8.076612903225807e-05, + "loss": 0.2047, + "step": 12426 + }, + { + "epoch": 0.198832, + "grad_norm": 0.95703125, + "learning_rate": 8.076451612903227e-05, + "loss": 0.1759, + "step": 12427 + }, + { + "epoch": 0.198848, + "grad_norm": 0.84375, + "learning_rate": 8.076290322580645e-05, + "loss": 0.176, + "step": 12428 + }, + { + "epoch": 0.198864, + "grad_norm": 0.71875, + "learning_rate": 8.076129032258065e-05, + "loss": 0.1798, + "step": 12429 + }, + { + "epoch": 0.19888, + "grad_norm": 0.91796875, + "learning_rate": 8.075967741935484e-05, + "loss": 0.2376, + "step": 12430 + }, + { + "epoch": 0.198896, + "grad_norm": 0.71875, + "learning_rate": 8.075806451612904e-05, + "loss": 0.1672, + "step": 12431 + }, + { + "epoch": 0.198912, + "grad_norm": 0.9296875, + "learning_rate": 8.075645161290323e-05, + "loss": 0.2054, + "step": 12432 + }, + { + "epoch": 0.198928, + "grad_norm": 0.94140625, + "learning_rate": 8.075483870967743e-05, + "loss": 0.1528, + "step": 12433 + }, + { + "epoch": 0.198944, + "grad_norm": 0.9609375, + "learning_rate": 8.075322580645161e-05, + "loss": 0.1727, + "step": 12434 + }, + { + "epoch": 0.19896, + "grad_norm": 0.91796875, + "learning_rate": 8.075161290322581e-05, + "loss": 0.1748, + "step": 12435 + }, + { + "epoch": 0.198976, + "grad_norm": 0.89453125, + "learning_rate": 8.075e-05, + "loss": 0.1929, + "step": 12436 + }, + { + "epoch": 0.198992, + "grad_norm": 0.52734375, + "learning_rate": 8.07483870967742e-05, + "loss": 0.1611, + "step": 12437 + }, + { + "epoch": 0.199008, + "grad_norm": 0.8203125, + "learning_rate": 8.07467741935484e-05, + "loss": 0.1481, + "step": 12438 + }, + { + "epoch": 0.199024, + "grad_norm": 0.90234375, + "learning_rate": 8.074516129032258e-05, + "loss": 0.1661, + "step": 12439 + }, + { + "epoch": 0.19904, + "grad_norm": 0.578125, + "learning_rate": 8.074354838709678e-05, + "loss": 0.1793, + "step": 12440 + }, + { + "epoch": 0.199056, + "grad_norm": 1.3046875, + "learning_rate": 8.074193548387097e-05, + "loss": 0.2046, + "step": 12441 + }, + { + "epoch": 0.199072, + "grad_norm": 0.609375, + "learning_rate": 8.074032258064517e-05, + "loss": 0.1611, + "step": 12442 + }, + { + "epoch": 0.199088, + "grad_norm": 1.0546875, + "learning_rate": 8.073870967741935e-05, + "loss": 0.2131, + "step": 12443 + }, + { + "epoch": 0.199104, + "grad_norm": 0.53125, + "learning_rate": 8.073709677419355e-05, + "loss": 0.1778, + "step": 12444 + }, + { + "epoch": 0.19912, + "grad_norm": 0.765625, + "learning_rate": 8.073548387096774e-05, + "loss": 0.164, + "step": 12445 + }, + { + "epoch": 0.199136, + "grad_norm": 1.1640625, + "learning_rate": 8.073387096774194e-05, + "loss": 0.1845, + "step": 12446 + }, + { + "epoch": 0.199152, + "grad_norm": 0.61328125, + "learning_rate": 8.073225806451613e-05, + "loss": 0.191, + "step": 12447 + }, + { + "epoch": 0.199168, + "grad_norm": 0.80859375, + "learning_rate": 8.073064516129033e-05, + "loss": 0.1649, + "step": 12448 + }, + { + "epoch": 0.199184, + "grad_norm": 0.9453125, + "learning_rate": 8.072903225806453e-05, + "loss": 0.1864, + "step": 12449 + }, + { + "epoch": 0.1992, + "grad_norm": 1.015625, + "learning_rate": 8.072741935483872e-05, + "loss": 0.1497, + "step": 12450 + }, + { + "epoch": 0.199216, + "grad_norm": 1.1796875, + "learning_rate": 8.072580645161291e-05, + "loss": 0.1691, + "step": 12451 + }, + { + "epoch": 0.199232, + "grad_norm": 0.83203125, + "learning_rate": 8.072419354838711e-05, + "loss": 0.1921, + "step": 12452 + }, + { + "epoch": 0.199248, + "grad_norm": 0.78125, + "learning_rate": 8.07225806451613e-05, + "loss": 0.2133, + "step": 12453 + }, + { + "epoch": 0.199264, + "grad_norm": 1.8046875, + "learning_rate": 8.072096774193548e-05, + "loss": 0.201, + "step": 12454 + }, + { + "epoch": 0.19928, + "grad_norm": 0.99609375, + "learning_rate": 8.071935483870968e-05, + "loss": 0.1639, + "step": 12455 + }, + { + "epoch": 0.199296, + "grad_norm": 0.73828125, + "learning_rate": 8.071774193548387e-05, + "loss": 0.1556, + "step": 12456 + }, + { + "epoch": 0.199312, + "grad_norm": 1.0859375, + "learning_rate": 8.071612903225807e-05, + "loss": 0.2058, + "step": 12457 + }, + { + "epoch": 0.199328, + "grad_norm": 0.98828125, + "learning_rate": 8.071451612903225e-05, + "loss": 0.1519, + "step": 12458 + }, + { + "epoch": 0.199344, + "grad_norm": 0.59765625, + "learning_rate": 8.071290322580645e-05, + "loss": 0.1634, + "step": 12459 + }, + { + "epoch": 0.19936, + "grad_norm": 0.78515625, + "learning_rate": 8.071129032258064e-05, + "loss": 0.1815, + "step": 12460 + }, + { + "epoch": 0.199376, + "grad_norm": 0.9375, + "learning_rate": 8.070967741935484e-05, + "loss": 0.1712, + "step": 12461 + }, + { + "epoch": 0.199392, + "grad_norm": 0.84375, + "learning_rate": 8.070806451612904e-05, + "loss": 0.1745, + "step": 12462 + }, + { + "epoch": 0.199408, + "grad_norm": 1.375, + "learning_rate": 8.070645161290324e-05, + "loss": 0.2, + "step": 12463 + }, + { + "epoch": 0.199424, + "grad_norm": 0.62890625, + "learning_rate": 8.070483870967742e-05, + "loss": 0.1581, + "step": 12464 + }, + { + "epoch": 0.19944, + "grad_norm": 1.1953125, + "learning_rate": 8.070322580645162e-05, + "loss": 0.1978, + "step": 12465 + }, + { + "epoch": 0.199456, + "grad_norm": 1.328125, + "learning_rate": 8.070161290322581e-05, + "loss": 0.1818, + "step": 12466 + }, + { + "epoch": 0.199472, + "grad_norm": 1.484375, + "learning_rate": 8.070000000000001e-05, + "loss": 0.2174, + "step": 12467 + }, + { + "epoch": 0.199488, + "grad_norm": 0.80078125, + "learning_rate": 8.06983870967742e-05, + "loss": 0.1432, + "step": 12468 + }, + { + "epoch": 0.199504, + "grad_norm": 0.84375, + "learning_rate": 8.069677419354838e-05, + "loss": 0.1961, + "step": 12469 + }, + { + "epoch": 0.19952, + "grad_norm": 0.80078125, + "learning_rate": 8.069516129032258e-05, + "loss": 0.1874, + "step": 12470 + }, + { + "epoch": 0.199536, + "grad_norm": 0.6015625, + "learning_rate": 8.069354838709677e-05, + "loss": 0.1748, + "step": 12471 + }, + { + "epoch": 0.199552, + "grad_norm": 0.54296875, + "learning_rate": 8.069193548387097e-05, + "loss": 0.1597, + "step": 12472 + }, + { + "epoch": 0.199568, + "grad_norm": 0.65625, + "learning_rate": 8.069032258064517e-05, + "loss": 0.1632, + "step": 12473 + }, + { + "epoch": 0.199584, + "grad_norm": 0.62890625, + "learning_rate": 8.068870967741937e-05, + "loss": 0.1974, + "step": 12474 + }, + { + "epoch": 0.1996, + "grad_norm": 0.6953125, + "learning_rate": 8.068709677419355e-05, + "loss": 0.1933, + "step": 12475 + }, + { + "epoch": 0.199616, + "grad_norm": 1.1953125, + "learning_rate": 8.068548387096775e-05, + "loss": 0.1658, + "step": 12476 + }, + { + "epoch": 0.199632, + "grad_norm": 0.765625, + "learning_rate": 8.068387096774194e-05, + "loss": 0.1827, + "step": 12477 + }, + { + "epoch": 0.199648, + "grad_norm": 0.953125, + "learning_rate": 8.068225806451614e-05, + "loss": 0.1685, + "step": 12478 + }, + { + "epoch": 0.199664, + "grad_norm": 1.2109375, + "learning_rate": 8.068064516129032e-05, + "loss": 0.1899, + "step": 12479 + }, + { + "epoch": 0.19968, + "grad_norm": 1.1484375, + "learning_rate": 8.067903225806452e-05, + "loss": 0.1497, + "step": 12480 + }, + { + "epoch": 0.199696, + "grad_norm": 0.72265625, + "learning_rate": 8.067741935483871e-05, + "loss": 0.1454, + "step": 12481 + }, + { + "epoch": 0.199712, + "grad_norm": 0.70703125, + "learning_rate": 8.067580645161291e-05, + "loss": 0.1892, + "step": 12482 + }, + { + "epoch": 0.199728, + "grad_norm": 0.7421875, + "learning_rate": 8.06741935483871e-05, + "loss": 0.1426, + "step": 12483 + }, + { + "epoch": 0.199744, + "grad_norm": 0.66796875, + "learning_rate": 8.06725806451613e-05, + "loss": 0.145, + "step": 12484 + }, + { + "epoch": 0.19976, + "grad_norm": 0.95703125, + "learning_rate": 8.067096774193548e-05, + "loss": 0.1616, + "step": 12485 + }, + { + "epoch": 0.199776, + "grad_norm": 0.765625, + "learning_rate": 8.066935483870968e-05, + "loss": 0.2073, + "step": 12486 + }, + { + "epoch": 0.199792, + "grad_norm": 0.953125, + "learning_rate": 8.066774193548388e-05, + "loss": 0.1622, + "step": 12487 + }, + { + "epoch": 0.199808, + "grad_norm": 1.25, + "learning_rate": 8.066612903225807e-05, + "loss": 0.2179, + "step": 12488 + }, + { + "epoch": 0.199824, + "grad_norm": 0.7265625, + "learning_rate": 8.066451612903227e-05, + "loss": 0.1761, + "step": 12489 + }, + { + "epoch": 0.19984, + "grad_norm": 0.69140625, + "learning_rate": 8.066290322580645e-05, + "loss": 0.1627, + "step": 12490 + }, + { + "epoch": 0.199856, + "grad_norm": 1.078125, + "learning_rate": 8.066129032258065e-05, + "loss": 0.1985, + "step": 12491 + }, + { + "epoch": 0.199872, + "grad_norm": 0.62109375, + "learning_rate": 8.065967741935484e-05, + "loss": 0.1758, + "step": 12492 + }, + { + "epoch": 0.199888, + "grad_norm": 0.59765625, + "learning_rate": 8.065806451612904e-05, + "loss": 0.1423, + "step": 12493 + }, + { + "epoch": 0.199904, + "grad_norm": 0.83984375, + "learning_rate": 8.065645161290322e-05, + "loss": 0.1598, + "step": 12494 + }, + { + "epoch": 0.19992, + "grad_norm": 0.703125, + "learning_rate": 8.065483870967742e-05, + "loss": 0.1931, + "step": 12495 + }, + { + "epoch": 0.199936, + "grad_norm": 0.81640625, + "learning_rate": 8.065322580645161e-05, + "loss": 0.1859, + "step": 12496 + }, + { + "epoch": 0.199952, + "grad_norm": 0.609375, + "learning_rate": 8.065161290322581e-05, + "loss": 0.1598, + "step": 12497 + }, + { + "epoch": 0.199968, + "grad_norm": 0.6875, + "learning_rate": 8.065000000000001e-05, + "loss": 0.1531, + "step": 12498 + }, + { + "epoch": 0.199984, + "grad_norm": 0.67578125, + "learning_rate": 8.064838709677421e-05, + "loss": 0.1232, + "step": 12499 + }, + { + "epoch": 0.2, + "grad_norm": 0.62890625, + "learning_rate": 8.06467741935484e-05, + "loss": 0.1401, + "step": 12500 + }, + { + "epoch": 0.200016, + "grad_norm": 0.6640625, + "learning_rate": 8.064516129032258e-05, + "loss": 0.1151, + "step": 12501 + }, + { + "epoch": 0.200032, + "grad_norm": 0.66015625, + "learning_rate": 8.064354838709678e-05, + "loss": 0.1475, + "step": 12502 + }, + { + "epoch": 0.200048, + "grad_norm": 1.1640625, + "learning_rate": 8.064193548387097e-05, + "loss": 0.1784, + "step": 12503 + }, + { + "epoch": 0.200064, + "grad_norm": 0.625, + "learning_rate": 8.064032258064517e-05, + "loss": 0.1464, + "step": 12504 + }, + { + "epoch": 0.20008, + "grad_norm": 0.6953125, + "learning_rate": 8.063870967741935e-05, + "loss": 0.1558, + "step": 12505 + }, + { + "epoch": 0.200096, + "grad_norm": 0.62890625, + "learning_rate": 8.063709677419355e-05, + "loss": 0.1411, + "step": 12506 + }, + { + "epoch": 0.200112, + "grad_norm": 0.92578125, + "learning_rate": 8.063548387096774e-05, + "loss": 0.1357, + "step": 12507 + }, + { + "epoch": 0.200128, + "grad_norm": 0.703125, + "learning_rate": 8.063387096774194e-05, + "loss": 0.1656, + "step": 12508 + }, + { + "epoch": 0.200144, + "grad_norm": 1.0625, + "learning_rate": 8.063225806451614e-05, + "loss": 0.163, + "step": 12509 + }, + { + "epoch": 0.20016, + "grad_norm": 0.65234375, + "learning_rate": 8.063064516129034e-05, + "loss": 0.159, + "step": 12510 + }, + { + "epoch": 0.200176, + "grad_norm": 0.61328125, + "learning_rate": 8.062903225806452e-05, + "loss": 0.2026, + "step": 12511 + }, + { + "epoch": 0.200192, + "grad_norm": 0.93359375, + "learning_rate": 8.062741935483872e-05, + "loss": 0.2079, + "step": 12512 + }, + { + "epoch": 0.200208, + "grad_norm": 0.55078125, + "learning_rate": 8.062580645161291e-05, + "loss": 0.1699, + "step": 12513 + }, + { + "epoch": 0.200224, + "grad_norm": 1.0859375, + "learning_rate": 8.062419354838711e-05, + "loss": 0.178, + "step": 12514 + }, + { + "epoch": 0.20024, + "grad_norm": 0.6640625, + "learning_rate": 8.06225806451613e-05, + "loss": 0.1965, + "step": 12515 + }, + { + "epoch": 0.200256, + "grad_norm": 0.5234375, + "learning_rate": 8.062096774193548e-05, + "loss": 0.1549, + "step": 12516 + }, + { + "epoch": 0.200272, + "grad_norm": 0.80078125, + "learning_rate": 8.061935483870968e-05, + "loss": 0.159, + "step": 12517 + }, + { + "epoch": 0.200288, + "grad_norm": 0.96875, + "learning_rate": 8.061774193548387e-05, + "loss": 0.1235, + "step": 12518 + }, + { + "epoch": 0.200304, + "grad_norm": 0.875, + "learning_rate": 8.061612903225807e-05, + "loss": 0.1779, + "step": 12519 + }, + { + "epoch": 0.20032, + "grad_norm": 0.83984375, + "learning_rate": 8.061451612903225e-05, + "loss": 0.185, + "step": 12520 + }, + { + "epoch": 0.200336, + "grad_norm": 0.6171875, + "learning_rate": 8.061290322580645e-05, + "loss": 0.1697, + "step": 12521 + }, + { + "epoch": 0.200352, + "grad_norm": 0.6640625, + "learning_rate": 8.061129032258065e-05, + "loss": 0.143, + "step": 12522 + }, + { + "epoch": 0.200368, + "grad_norm": 0.7734375, + "learning_rate": 8.060967741935485e-05, + "loss": 0.1828, + "step": 12523 + }, + { + "epoch": 0.200384, + "grad_norm": 1.1015625, + "learning_rate": 8.060806451612904e-05, + "loss": 0.1846, + "step": 12524 + }, + { + "epoch": 0.2004, + "grad_norm": 0.71875, + "learning_rate": 8.060645161290324e-05, + "loss": 0.1564, + "step": 12525 + }, + { + "epoch": 0.200416, + "grad_norm": 0.90234375, + "learning_rate": 8.060483870967742e-05, + "loss": 0.1601, + "step": 12526 + }, + { + "epoch": 0.200432, + "grad_norm": 0.734375, + "learning_rate": 8.060322580645162e-05, + "loss": 0.2183, + "step": 12527 + }, + { + "epoch": 0.200448, + "grad_norm": 0.875, + "learning_rate": 8.060161290322581e-05, + "loss": 0.1987, + "step": 12528 + }, + { + "epoch": 0.200464, + "grad_norm": 0.73046875, + "learning_rate": 8.060000000000001e-05, + "loss": 0.1578, + "step": 12529 + }, + { + "epoch": 0.20048, + "grad_norm": 0.70703125, + "learning_rate": 8.05983870967742e-05, + "loss": 0.1732, + "step": 12530 + }, + { + "epoch": 0.200496, + "grad_norm": 1.0859375, + "learning_rate": 8.059677419354838e-05, + "loss": 0.1487, + "step": 12531 + }, + { + "epoch": 0.200512, + "grad_norm": 0.68359375, + "learning_rate": 8.059516129032258e-05, + "loss": 0.1657, + "step": 12532 + }, + { + "epoch": 0.200528, + "grad_norm": 1.2421875, + "learning_rate": 8.059354838709678e-05, + "loss": 0.16, + "step": 12533 + }, + { + "epoch": 0.200544, + "grad_norm": 0.98828125, + "learning_rate": 8.059193548387098e-05, + "loss": 0.1835, + "step": 12534 + }, + { + "epoch": 0.20056, + "grad_norm": 0.5234375, + "learning_rate": 8.059032258064516e-05, + "loss": 0.1322, + "step": 12535 + }, + { + "epoch": 0.200576, + "grad_norm": 0.73828125, + "learning_rate": 8.058870967741936e-05, + "loss": 0.1546, + "step": 12536 + }, + { + "epoch": 0.200592, + "grad_norm": 0.671875, + "learning_rate": 8.058709677419355e-05, + "loss": 0.1657, + "step": 12537 + }, + { + "epoch": 0.200608, + "grad_norm": 0.671875, + "learning_rate": 8.058548387096775e-05, + "loss": 0.1967, + "step": 12538 + }, + { + "epoch": 0.200624, + "grad_norm": 0.6640625, + "learning_rate": 8.058387096774194e-05, + "loss": 0.1454, + "step": 12539 + }, + { + "epoch": 0.20064, + "grad_norm": 1.0, + "learning_rate": 8.058225806451614e-05, + "loss": 0.2017, + "step": 12540 + }, + { + "epoch": 0.200656, + "grad_norm": 0.76171875, + "learning_rate": 8.058064516129032e-05, + "loss": 0.1495, + "step": 12541 + }, + { + "epoch": 0.200672, + "grad_norm": 0.796875, + "learning_rate": 8.057903225806452e-05, + "loss": 0.1659, + "step": 12542 + }, + { + "epoch": 0.200688, + "grad_norm": 1.3203125, + "learning_rate": 8.057741935483871e-05, + "loss": 0.2032, + "step": 12543 + }, + { + "epoch": 0.200704, + "grad_norm": 0.490234375, + "learning_rate": 8.057580645161291e-05, + "loss": 0.1578, + "step": 12544 + }, + { + "epoch": 0.20072, + "grad_norm": 0.9765625, + "learning_rate": 8.057419354838711e-05, + "loss": 0.2187, + "step": 12545 + }, + { + "epoch": 0.200736, + "grad_norm": 1.09375, + "learning_rate": 8.05725806451613e-05, + "loss": 0.1646, + "step": 12546 + }, + { + "epoch": 0.200752, + "grad_norm": 0.62890625, + "learning_rate": 8.057096774193549e-05, + "loss": 0.1799, + "step": 12547 + }, + { + "epoch": 0.200768, + "grad_norm": 1.296875, + "learning_rate": 8.056935483870968e-05, + "loss": 0.1675, + "step": 12548 + }, + { + "epoch": 0.200784, + "grad_norm": 0.9296875, + "learning_rate": 8.056774193548388e-05, + "loss": 0.1756, + "step": 12549 + }, + { + "epoch": 0.2008, + "grad_norm": 1.0, + "learning_rate": 8.056612903225806e-05, + "loss": 0.1555, + "step": 12550 + }, + { + "epoch": 0.200816, + "grad_norm": 0.6328125, + "learning_rate": 8.056451612903226e-05, + "loss": 0.1364, + "step": 12551 + }, + { + "epoch": 0.200832, + "grad_norm": 0.82421875, + "learning_rate": 8.056290322580645e-05, + "loss": 0.1849, + "step": 12552 + }, + { + "epoch": 0.200848, + "grad_norm": 1.0859375, + "learning_rate": 8.056129032258065e-05, + "loss": 0.1956, + "step": 12553 + }, + { + "epoch": 0.200864, + "grad_norm": 1.4609375, + "learning_rate": 8.055967741935484e-05, + "loss": 0.2034, + "step": 12554 + }, + { + "epoch": 0.20088, + "grad_norm": 1.515625, + "learning_rate": 8.055806451612904e-05, + "loss": 0.1775, + "step": 12555 + }, + { + "epoch": 0.200896, + "grad_norm": 1.21875, + "learning_rate": 8.055645161290322e-05, + "loss": 0.1602, + "step": 12556 + }, + { + "epoch": 0.200912, + "grad_norm": 0.80859375, + "learning_rate": 8.055483870967742e-05, + "loss": 0.177, + "step": 12557 + }, + { + "epoch": 0.200928, + "grad_norm": 0.75390625, + "learning_rate": 8.055322580645162e-05, + "loss": 0.1491, + "step": 12558 + }, + { + "epoch": 0.200944, + "grad_norm": 0.94140625, + "learning_rate": 8.055161290322582e-05, + "loss": 0.2127, + "step": 12559 + }, + { + "epoch": 0.20096, + "grad_norm": 1.125, + "learning_rate": 8.055e-05, + "loss": 0.176, + "step": 12560 + }, + { + "epoch": 0.200976, + "grad_norm": 0.76171875, + "learning_rate": 8.05483870967742e-05, + "loss": 0.1706, + "step": 12561 + }, + { + "epoch": 0.200992, + "grad_norm": 0.76953125, + "learning_rate": 8.054677419354839e-05, + "loss": 0.1622, + "step": 12562 + }, + { + "epoch": 0.201008, + "grad_norm": 0.62890625, + "learning_rate": 8.054516129032258e-05, + "loss": 0.1528, + "step": 12563 + }, + { + "epoch": 0.201024, + "grad_norm": 0.64453125, + "learning_rate": 8.054354838709678e-05, + "loss": 0.1746, + "step": 12564 + }, + { + "epoch": 0.20104, + "grad_norm": 0.63671875, + "learning_rate": 8.054193548387096e-05, + "loss": 0.179, + "step": 12565 + }, + { + "epoch": 0.201056, + "grad_norm": 0.79296875, + "learning_rate": 8.054032258064516e-05, + "loss": 0.2006, + "step": 12566 + }, + { + "epoch": 0.201072, + "grad_norm": 0.72265625, + "learning_rate": 8.053870967741935e-05, + "loss": 0.2142, + "step": 12567 + }, + { + "epoch": 0.201088, + "grad_norm": 0.70703125, + "learning_rate": 8.053709677419355e-05, + "loss": 0.1736, + "step": 12568 + }, + { + "epoch": 0.201104, + "grad_norm": 0.6640625, + "learning_rate": 8.053548387096775e-05, + "loss": 0.1884, + "step": 12569 + }, + { + "epoch": 0.20112, + "grad_norm": 1.0234375, + "learning_rate": 8.053387096774195e-05, + "loss": 0.1342, + "step": 12570 + }, + { + "epoch": 0.201136, + "grad_norm": 0.7578125, + "learning_rate": 8.053225806451613e-05, + "loss": 0.1432, + "step": 12571 + }, + { + "epoch": 0.201152, + "grad_norm": 0.609375, + "learning_rate": 8.053064516129033e-05, + "loss": 0.1878, + "step": 12572 + }, + { + "epoch": 0.201168, + "grad_norm": 0.65625, + "learning_rate": 8.052903225806452e-05, + "loss": 0.1658, + "step": 12573 + }, + { + "epoch": 0.201184, + "grad_norm": 0.58984375, + "learning_rate": 8.052741935483872e-05, + "loss": 0.1546, + "step": 12574 + }, + { + "epoch": 0.2012, + "grad_norm": 0.8046875, + "learning_rate": 8.05258064516129e-05, + "loss": 0.1525, + "step": 12575 + }, + { + "epoch": 0.201216, + "grad_norm": 0.6328125, + "learning_rate": 8.05241935483871e-05, + "loss": 0.1493, + "step": 12576 + }, + { + "epoch": 0.201232, + "grad_norm": 1.0390625, + "learning_rate": 8.052258064516129e-05, + "loss": 0.1749, + "step": 12577 + }, + { + "epoch": 0.201248, + "grad_norm": 1.0859375, + "learning_rate": 8.052096774193548e-05, + "loss": 0.1613, + "step": 12578 + }, + { + "epoch": 0.201264, + "grad_norm": 0.8359375, + "learning_rate": 8.051935483870968e-05, + "loss": 0.1294, + "step": 12579 + }, + { + "epoch": 0.20128, + "grad_norm": 0.9921875, + "learning_rate": 8.051774193548388e-05, + "loss": 0.1859, + "step": 12580 + }, + { + "epoch": 0.201296, + "grad_norm": 0.77734375, + "learning_rate": 8.051612903225806e-05, + "loss": 0.207, + "step": 12581 + }, + { + "epoch": 0.201312, + "grad_norm": 0.73046875, + "learning_rate": 8.051451612903226e-05, + "loss": 0.1575, + "step": 12582 + }, + { + "epoch": 0.201328, + "grad_norm": 0.70703125, + "learning_rate": 8.051290322580646e-05, + "loss": 0.1427, + "step": 12583 + }, + { + "epoch": 0.201344, + "grad_norm": 0.546875, + "learning_rate": 8.051129032258065e-05, + "loss": 0.117, + "step": 12584 + }, + { + "epoch": 0.20136, + "grad_norm": 0.84765625, + "learning_rate": 8.050967741935485e-05, + "loss": 0.1644, + "step": 12585 + }, + { + "epoch": 0.201376, + "grad_norm": 1.1875, + "learning_rate": 8.050806451612903e-05, + "loss": 0.1588, + "step": 12586 + }, + { + "epoch": 0.201392, + "grad_norm": 0.7265625, + "learning_rate": 8.050645161290323e-05, + "loss": 0.1696, + "step": 12587 + }, + { + "epoch": 0.201408, + "grad_norm": 0.87890625, + "learning_rate": 8.050483870967742e-05, + "loss": 0.2027, + "step": 12588 + }, + { + "epoch": 0.201424, + "grad_norm": 0.95703125, + "learning_rate": 8.050322580645162e-05, + "loss": 0.158, + "step": 12589 + }, + { + "epoch": 0.20144, + "grad_norm": 0.7578125, + "learning_rate": 8.05016129032258e-05, + "loss": 0.1419, + "step": 12590 + }, + { + "epoch": 0.201456, + "grad_norm": 0.51953125, + "learning_rate": 8.05e-05, + "loss": 0.1692, + "step": 12591 + }, + { + "epoch": 0.201472, + "grad_norm": 0.478515625, + "learning_rate": 8.049838709677419e-05, + "loss": 0.135, + "step": 12592 + }, + { + "epoch": 0.201488, + "grad_norm": 0.796875, + "learning_rate": 8.049677419354839e-05, + "loss": 0.1347, + "step": 12593 + }, + { + "epoch": 0.201504, + "grad_norm": 0.9765625, + "learning_rate": 8.049516129032259e-05, + "loss": 0.1646, + "step": 12594 + }, + { + "epoch": 0.20152, + "grad_norm": 0.5625, + "learning_rate": 8.049354838709678e-05, + "loss": 0.1656, + "step": 12595 + }, + { + "epoch": 0.201536, + "grad_norm": 0.6640625, + "learning_rate": 8.049193548387098e-05, + "loss": 0.1722, + "step": 12596 + }, + { + "epoch": 0.201552, + "grad_norm": 1.078125, + "learning_rate": 8.049032258064516e-05, + "loss": 0.1901, + "step": 12597 + }, + { + "epoch": 0.201568, + "grad_norm": 1.2734375, + "learning_rate": 8.048870967741936e-05, + "loss": 0.1839, + "step": 12598 + }, + { + "epoch": 0.201584, + "grad_norm": 0.59375, + "learning_rate": 8.048709677419355e-05, + "loss": 0.1336, + "step": 12599 + }, + { + "epoch": 0.2016, + "grad_norm": 0.48828125, + "learning_rate": 8.048548387096775e-05, + "loss": 0.1451, + "step": 12600 + }, + { + "epoch": 0.201616, + "grad_norm": 0.94140625, + "learning_rate": 8.048387096774193e-05, + "loss": 0.1964, + "step": 12601 + }, + { + "epoch": 0.201632, + "grad_norm": 0.66796875, + "learning_rate": 8.048225806451613e-05, + "loss": 0.1946, + "step": 12602 + }, + { + "epoch": 0.201648, + "grad_norm": 1.0390625, + "learning_rate": 8.048064516129032e-05, + "loss": 0.1614, + "step": 12603 + }, + { + "epoch": 0.201664, + "grad_norm": 0.91796875, + "learning_rate": 8.047903225806452e-05, + "loss": 0.202, + "step": 12604 + }, + { + "epoch": 0.20168, + "grad_norm": 0.7578125, + "learning_rate": 8.047741935483872e-05, + "loss": 0.19, + "step": 12605 + }, + { + "epoch": 0.201696, + "grad_norm": 1.078125, + "learning_rate": 8.047580645161292e-05, + "loss": 0.2026, + "step": 12606 + }, + { + "epoch": 0.201712, + "grad_norm": 0.60546875, + "learning_rate": 8.04741935483871e-05, + "loss": 0.2054, + "step": 12607 + }, + { + "epoch": 0.201728, + "grad_norm": 0.71875, + "learning_rate": 8.04725806451613e-05, + "loss": 0.195, + "step": 12608 + }, + { + "epoch": 0.201744, + "grad_norm": 0.74609375, + "learning_rate": 8.047096774193549e-05, + "loss": 0.1447, + "step": 12609 + }, + { + "epoch": 0.20176, + "grad_norm": 0.66015625, + "learning_rate": 8.046935483870968e-05, + "loss": 0.169, + "step": 12610 + }, + { + "epoch": 0.201776, + "grad_norm": 1.2421875, + "learning_rate": 8.046774193548388e-05, + "loss": 0.1705, + "step": 12611 + }, + { + "epoch": 0.201792, + "grad_norm": 0.8046875, + "learning_rate": 8.046612903225806e-05, + "loss": 0.1617, + "step": 12612 + }, + { + "epoch": 0.201808, + "grad_norm": 1.3125, + "learning_rate": 8.046451612903226e-05, + "loss": 0.1972, + "step": 12613 + }, + { + "epoch": 0.201824, + "grad_norm": 0.98828125, + "learning_rate": 8.046290322580645e-05, + "loss": 0.1567, + "step": 12614 + }, + { + "epoch": 0.20184, + "grad_norm": 0.64453125, + "learning_rate": 8.046129032258065e-05, + "loss": 0.1466, + "step": 12615 + }, + { + "epoch": 0.201856, + "grad_norm": 0.70703125, + "learning_rate": 8.045967741935483e-05, + "loss": 0.1901, + "step": 12616 + }, + { + "epoch": 0.201872, + "grad_norm": 0.7109375, + "learning_rate": 8.045806451612903e-05, + "loss": 0.178, + "step": 12617 + }, + { + "epoch": 0.201888, + "grad_norm": 0.6015625, + "learning_rate": 8.045645161290323e-05, + "loss": 0.153, + "step": 12618 + }, + { + "epoch": 0.201904, + "grad_norm": 0.828125, + "learning_rate": 8.045483870967743e-05, + "loss": 0.1943, + "step": 12619 + }, + { + "epoch": 0.20192, + "grad_norm": 0.734375, + "learning_rate": 8.045322580645162e-05, + "loss": 0.1874, + "step": 12620 + }, + { + "epoch": 0.201936, + "grad_norm": 0.7421875, + "learning_rate": 8.045161290322582e-05, + "loss": 0.1639, + "step": 12621 + }, + { + "epoch": 0.201952, + "grad_norm": 0.625, + "learning_rate": 8.045e-05, + "loss": 0.1977, + "step": 12622 + }, + { + "epoch": 0.201968, + "grad_norm": 0.63671875, + "learning_rate": 8.04483870967742e-05, + "loss": 0.1933, + "step": 12623 + }, + { + "epoch": 0.201984, + "grad_norm": 0.81640625, + "learning_rate": 8.044677419354839e-05, + "loss": 0.1522, + "step": 12624 + }, + { + "epoch": 0.202, + "grad_norm": 0.640625, + "learning_rate": 8.044516129032258e-05, + "loss": 0.1703, + "step": 12625 + }, + { + "epoch": 0.202016, + "grad_norm": 0.67578125, + "learning_rate": 8.044354838709678e-05, + "loss": 0.1366, + "step": 12626 + }, + { + "epoch": 0.202032, + "grad_norm": 0.546875, + "learning_rate": 8.044193548387096e-05, + "loss": 0.1814, + "step": 12627 + }, + { + "epoch": 0.202048, + "grad_norm": 0.671875, + "learning_rate": 8.044032258064516e-05, + "loss": 0.1729, + "step": 12628 + }, + { + "epoch": 0.202064, + "grad_norm": 1.046875, + "learning_rate": 8.043870967741936e-05, + "loss": 0.2214, + "step": 12629 + }, + { + "epoch": 0.20208, + "grad_norm": 0.671875, + "learning_rate": 8.043709677419356e-05, + "loss": 0.1704, + "step": 12630 + }, + { + "epoch": 0.202096, + "grad_norm": 0.6015625, + "learning_rate": 8.043548387096775e-05, + "loss": 0.1451, + "step": 12631 + }, + { + "epoch": 0.202112, + "grad_norm": 0.8046875, + "learning_rate": 8.043387096774195e-05, + "loss": 0.1518, + "step": 12632 + }, + { + "epoch": 0.202128, + "grad_norm": 0.6171875, + "learning_rate": 8.043225806451613e-05, + "loss": 0.1558, + "step": 12633 + }, + { + "epoch": 0.202144, + "grad_norm": 0.78515625, + "learning_rate": 8.043064516129033e-05, + "loss": 0.1665, + "step": 12634 + }, + { + "epoch": 0.20216, + "grad_norm": 0.9375, + "learning_rate": 8.042903225806452e-05, + "loss": 0.187, + "step": 12635 + }, + { + "epoch": 0.202176, + "grad_norm": 0.8046875, + "learning_rate": 8.042741935483872e-05, + "loss": 0.1963, + "step": 12636 + }, + { + "epoch": 0.202192, + "grad_norm": 0.765625, + "learning_rate": 8.04258064516129e-05, + "loss": 0.1584, + "step": 12637 + }, + { + "epoch": 0.202208, + "grad_norm": 0.71875, + "learning_rate": 8.04241935483871e-05, + "loss": 0.1637, + "step": 12638 + }, + { + "epoch": 0.202224, + "grad_norm": 0.67578125, + "learning_rate": 8.042258064516129e-05, + "loss": 0.1649, + "step": 12639 + }, + { + "epoch": 0.20224, + "grad_norm": 0.69921875, + "learning_rate": 8.042096774193549e-05, + "loss": 0.1794, + "step": 12640 + }, + { + "epoch": 0.202256, + "grad_norm": 0.8125, + "learning_rate": 8.041935483870969e-05, + "loss": 0.1852, + "step": 12641 + }, + { + "epoch": 0.202272, + "grad_norm": 0.64453125, + "learning_rate": 8.041774193548387e-05, + "loss": 0.1336, + "step": 12642 + }, + { + "epoch": 0.202288, + "grad_norm": 0.76953125, + "learning_rate": 8.041612903225807e-05, + "loss": 0.1564, + "step": 12643 + }, + { + "epoch": 0.202304, + "grad_norm": 0.5703125, + "learning_rate": 8.041451612903226e-05, + "loss": 0.1646, + "step": 12644 + }, + { + "epoch": 0.20232, + "grad_norm": 0.6328125, + "learning_rate": 8.041290322580646e-05, + "loss": 0.1583, + "step": 12645 + }, + { + "epoch": 0.202336, + "grad_norm": 1.1171875, + "learning_rate": 8.041129032258065e-05, + "loss": 0.1687, + "step": 12646 + }, + { + "epoch": 0.202352, + "grad_norm": 1.109375, + "learning_rate": 8.040967741935485e-05, + "loss": 0.2053, + "step": 12647 + }, + { + "epoch": 0.202368, + "grad_norm": 0.6875, + "learning_rate": 8.040806451612903e-05, + "loss": 0.1589, + "step": 12648 + }, + { + "epoch": 0.202384, + "grad_norm": 0.66015625, + "learning_rate": 8.040645161290323e-05, + "loss": 0.1694, + "step": 12649 + }, + { + "epoch": 0.2024, + "grad_norm": 0.61328125, + "learning_rate": 8.040483870967742e-05, + "loss": 0.177, + "step": 12650 + }, + { + "epoch": 0.202416, + "grad_norm": 0.8671875, + "learning_rate": 8.040322580645162e-05, + "loss": 0.216, + "step": 12651 + }, + { + "epoch": 0.202432, + "grad_norm": 0.83203125, + "learning_rate": 8.04016129032258e-05, + "loss": 0.1975, + "step": 12652 + }, + { + "epoch": 0.202448, + "grad_norm": 0.64453125, + "learning_rate": 8.04e-05, + "loss": 0.1629, + "step": 12653 + }, + { + "epoch": 0.202464, + "grad_norm": 1.171875, + "learning_rate": 8.03983870967742e-05, + "loss": 0.17, + "step": 12654 + }, + { + "epoch": 0.20248, + "grad_norm": 1.015625, + "learning_rate": 8.03967741935484e-05, + "loss": 0.1636, + "step": 12655 + }, + { + "epoch": 0.202496, + "grad_norm": 0.58984375, + "learning_rate": 8.039516129032259e-05, + "loss": 0.1563, + "step": 12656 + }, + { + "epoch": 0.202512, + "grad_norm": 0.78125, + "learning_rate": 8.039354838709677e-05, + "loss": 0.188, + "step": 12657 + }, + { + "epoch": 0.202528, + "grad_norm": 0.77734375, + "learning_rate": 8.039193548387097e-05, + "loss": 0.1713, + "step": 12658 + }, + { + "epoch": 0.202544, + "grad_norm": 0.83984375, + "learning_rate": 8.039032258064516e-05, + "loss": 0.1552, + "step": 12659 + }, + { + "epoch": 0.20256, + "grad_norm": 0.99609375, + "learning_rate": 8.038870967741936e-05, + "loss": 0.1784, + "step": 12660 + }, + { + "epoch": 0.202576, + "grad_norm": 0.96875, + "learning_rate": 8.038709677419355e-05, + "loss": 0.2002, + "step": 12661 + }, + { + "epoch": 0.202592, + "grad_norm": 0.93359375, + "learning_rate": 8.038548387096775e-05, + "loss": 0.175, + "step": 12662 + }, + { + "epoch": 0.202608, + "grad_norm": 1.0703125, + "learning_rate": 8.038387096774193e-05, + "loss": 0.1853, + "step": 12663 + }, + { + "epoch": 0.202624, + "grad_norm": 0.78515625, + "learning_rate": 8.038225806451613e-05, + "loss": 0.1727, + "step": 12664 + }, + { + "epoch": 0.20264, + "grad_norm": 0.8828125, + "learning_rate": 8.038064516129033e-05, + "loss": 0.1488, + "step": 12665 + }, + { + "epoch": 0.202656, + "grad_norm": 0.57421875, + "learning_rate": 8.037903225806453e-05, + "loss": 0.1599, + "step": 12666 + }, + { + "epoch": 0.202672, + "grad_norm": 0.462890625, + "learning_rate": 8.037741935483872e-05, + "loss": 0.1498, + "step": 12667 + }, + { + "epoch": 0.202688, + "grad_norm": 0.80078125, + "learning_rate": 8.037580645161292e-05, + "loss": 0.1879, + "step": 12668 + }, + { + "epoch": 0.202704, + "grad_norm": 0.88671875, + "learning_rate": 8.03741935483871e-05, + "loss": 0.193, + "step": 12669 + }, + { + "epoch": 0.20272, + "grad_norm": 0.921875, + "learning_rate": 8.03725806451613e-05, + "loss": 0.1643, + "step": 12670 + }, + { + "epoch": 0.202736, + "grad_norm": 0.83984375, + "learning_rate": 8.037096774193549e-05, + "loss": 0.1866, + "step": 12671 + }, + { + "epoch": 0.202752, + "grad_norm": 1.0546875, + "learning_rate": 8.036935483870967e-05, + "loss": 0.187, + "step": 12672 + }, + { + "epoch": 0.202768, + "grad_norm": 0.53515625, + "learning_rate": 8.036774193548387e-05, + "loss": 0.1616, + "step": 12673 + }, + { + "epoch": 0.202784, + "grad_norm": 0.61328125, + "learning_rate": 8.036612903225806e-05, + "loss": 0.1662, + "step": 12674 + }, + { + "epoch": 0.2028, + "grad_norm": 0.71484375, + "learning_rate": 8.036451612903226e-05, + "loss": 0.1678, + "step": 12675 + }, + { + "epoch": 0.202816, + "grad_norm": 0.64453125, + "learning_rate": 8.036290322580645e-05, + "loss": 0.1623, + "step": 12676 + }, + { + "epoch": 0.202832, + "grad_norm": 0.7890625, + "learning_rate": 8.036129032258064e-05, + "loss": 0.1662, + "step": 12677 + }, + { + "epoch": 0.202848, + "grad_norm": 0.66015625, + "learning_rate": 8.035967741935484e-05, + "loss": 0.1534, + "step": 12678 + }, + { + "epoch": 0.202864, + "grad_norm": 1.0625, + "learning_rate": 8.035806451612904e-05, + "loss": 0.1993, + "step": 12679 + }, + { + "epoch": 0.20288, + "grad_norm": 0.6875, + "learning_rate": 8.035645161290323e-05, + "loss": 0.1473, + "step": 12680 + }, + { + "epoch": 0.202896, + "grad_norm": 0.8671875, + "learning_rate": 8.035483870967743e-05, + "loss": 0.2039, + "step": 12681 + }, + { + "epoch": 0.202912, + "grad_norm": 0.99609375, + "learning_rate": 8.035322580645162e-05, + "loss": 0.1806, + "step": 12682 + }, + { + "epoch": 0.202928, + "grad_norm": 0.75, + "learning_rate": 8.035161290322582e-05, + "loss": 0.1455, + "step": 12683 + }, + { + "epoch": 0.202944, + "grad_norm": 0.78515625, + "learning_rate": 8.035e-05, + "loss": 0.1893, + "step": 12684 + }, + { + "epoch": 0.20296, + "grad_norm": 1.2109375, + "learning_rate": 8.03483870967742e-05, + "loss": 0.1694, + "step": 12685 + }, + { + "epoch": 0.202976, + "grad_norm": 0.7890625, + "learning_rate": 8.034677419354839e-05, + "loss": 0.1612, + "step": 12686 + }, + { + "epoch": 0.202992, + "grad_norm": 0.63671875, + "learning_rate": 8.034516129032257e-05, + "loss": 0.1507, + "step": 12687 + }, + { + "epoch": 0.203008, + "grad_norm": 0.875, + "learning_rate": 8.034354838709677e-05, + "loss": 0.175, + "step": 12688 + }, + { + "epoch": 0.203024, + "grad_norm": 0.7734375, + "learning_rate": 8.034193548387097e-05, + "loss": 0.2115, + "step": 12689 + }, + { + "epoch": 0.20304, + "grad_norm": 0.59375, + "learning_rate": 8.034032258064517e-05, + "loss": 0.1553, + "step": 12690 + }, + { + "epoch": 0.203056, + "grad_norm": 0.65625, + "learning_rate": 8.033870967741936e-05, + "loss": 0.1666, + "step": 12691 + }, + { + "epoch": 0.203072, + "grad_norm": 1.1015625, + "learning_rate": 8.033709677419356e-05, + "loss": 0.1458, + "step": 12692 + }, + { + "epoch": 0.203088, + "grad_norm": 0.66796875, + "learning_rate": 8.033548387096774e-05, + "loss": 0.1236, + "step": 12693 + }, + { + "epoch": 0.203104, + "grad_norm": 1.1953125, + "learning_rate": 8.033387096774194e-05, + "loss": 0.2036, + "step": 12694 + }, + { + "epoch": 0.20312, + "grad_norm": 1.34375, + "learning_rate": 8.033225806451613e-05, + "loss": 0.1949, + "step": 12695 + }, + { + "epoch": 0.203136, + "grad_norm": 1.1640625, + "learning_rate": 8.033064516129033e-05, + "loss": 0.1727, + "step": 12696 + }, + { + "epoch": 0.203152, + "grad_norm": 0.71484375, + "learning_rate": 8.032903225806452e-05, + "loss": 0.1812, + "step": 12697 + }, + { + "epoch": 0.203168, + "grad_norm": 0.79296875, + "learning_rate": 8.032741935483871e-05, + "loss": 0.2003, + "step": 12698 + }, + { + "epoch": 0.203184, + "grad_norm": 0.55078125, + "learning_rate": 8.03258064516129e-05, + "loss": 0.1565, + "step": 12699 + }, + { + "epoch": 0.2032, + "grad_norm": 0.81640625, + "learning_rate": 8.03241935483871e-05, + "loss": 0.186, + "step": 12700 + }, + { + "epoch": 0.203216, + "grad_norm": 0.6953125, + "learning_rate": 8.03225806451613e-05, + "loss": 0.1816, + "step": 12701 + }, + { + "epoch": 0.203232, + "grad_norm": 0.8359375, + "learning_rate": 8.032096774193549e-05, + "loss": 0.165, + "step": 12702 + }, + { + "epoch": 0.203248, + "grad_norm": 0.62890625, + "learning_rate": 8.031935483870969e-05, + "loss": 0.1588, + "step": 12703 + }, + { + "epoch": 0.203264, + "grad_norm": 0.734375, + "learning_rate": 8.031774193548387e-05, + "loss": 0.1875, + "step": 12704 + }, + { + "epoch": 0.20328, + "grad_norm": 0.8046875, + "learning_rate": 8.031612903225807e-05, + "loss": 0.1626, + "step": 12705 + }, + { + "epoch": 0.203296, + "grad_norm": 0.82421875, + "learning_rate": 8.031451612903226e-05, + "loss": 0.1612, + "step": 12706 + }, + { + "epoch": 0.203312, + "grad_norm": 0.7890625, + "learning_rate": 8.031290322580646e-05, + "loss": 0.187, + "step": 12707 + }, + { + "epoch": 0.203328, + "grad_norm": 0.67578125, + "learning_rate": 8.031129032258064e-05, + "loss": 0.1554, + "step": 12708 + }, + { + "epoch": 0.203344, + "grad_norm": 0.7265625, + "learning_rate": 8.030967741935484e-05, + "loss": 0.1244, + "step": 12709 + }, + { + "epoch": 0.20336, + "grad_norm": 1.171875, + "learning_rate": 8.030806451612903e-05, + "loss": 0.1863, + "step": 12710 + }, + { + "epoch": 0.203376, + "grad_norm": 0.9375, + "learning_rate": 8.030645161290323e-05, + "loss": 0.1735, + "step": 12711 + }, + { + "epoch": 0.203392, + "grad_norm": 1.1171875, + "learning_rate": 8.030483870967741e-05, + "loss": 0.2022, + "step": 12712 + }, + { + "epoch": 0.203408, + "grad_norm": 1.0859375, + "learning_rate": 8.030322580645161e-05, + "loss": 0.1609, + "step": 12713 + }, + { + "epoch": 0.203424, + "grad_norm": 0.72265625, + "learning_rate": 8.030161290322581e-05, + "loss": 0.1808, + "step": 12714 + }, + { + "epoch": 0.20344, + "grad_norm": 0.7578125, + "learning_rate": 8.030000000000001e-05, + "loss": 0.1895, + "step": 12715 + }, + { + "epoch": 0.203456, + "grad_norm": 0.6484375, + "learning_rate": 8.02983870967742e-05, + "loss": 0.1827, + "step": 12716 + }, + { + "epoch": 0.203472, + "grad_norm": 0.6484375, + "learning_rate": 8.02967741935484e-05, + "loss": 0.177, + "step": 12717 + }, + { + "epoch": 0.203488, + "grad_norm": 0.55859375, + "learning_rate": 8.029516129032259e-05, + "loss": 0.1724, + "step": 12718 + }, + { + "epoch": 0.203504, + "grad_norm": 0.609375, + "learning_rate": 8.029354838709677e-05, + "loss": 0.1667, + "step": 12719 + }, + { + "epoch": 0.20352, + "grad_norm": 1.9453125, + "learning_rate": 8.029193548387097e-05, + "loss": 0.1923, + "step": 12720 + }, + { + "epoch": 0.203536, + "grad_norm": 0.75390625, + "learning_rate": 8.029032258064516e-05, + "loss": 0.1529, + "step": 12721 + }, + { + "epoch": 0.203552, + "grad_norm": 0.97265625, + "learning_rate": 8.028870967741936e-05, + "loss": 0.1749, + "step": 12722 + }, + { + "epoch": 0.203568, + "grad_norm": 1.0078125, + "learning_rate": 8.028709677419354e-05, + "loss": 0.1619, + "step": 12723 + }, + { + "epoch": 0.203584, + "grad_norm": 0.455078125, + "learning_rate": 8.028548387096774e-05, + "loss": 0.159, + "step": 12724 + }, + { + "epoch": 0.2036, + "grad_norm": 1.578125, + "learning_rate": 8.028387096774194e-05, + "loss": 0.1619, + "step": 12725 + }, + { + "epoch": 0.203616, + "grad_norm": 0.76953125, + "learning_rate": 8.028225806451614e-05, + "loss": 0.1652, + "step": 12726 + }, + { + "epoch": 0.203632, + "grad_norm": 0.75390625, + "learning_rate": 8.028064516129033e-05, + "loss": 0.1801, + "step": 12727 + }, + { + "epoch": 0.203648, + "grad_norm": 0.8984375, + "learning_rate": 8.027903225806453e-05, + "loss": 0.1406, + "step": 12728 + }, + { + "epoch": 0.203664, + "grad_norm": 1.0546875, + "learning_rate": 8.027741935483871e-05, + "loss": 0.2058, + "step": 12729 + }, + { + "epoch": 0.20368, + "grad_norm": 0.6953125, + "learning_rate": 8.027580645161291e-05, + "loss": 0.189, + "step": 12730 + }, + { + "epoch": 0.203696, + "grad_norm": 0.796875, + "learning_rate": 8.02741935483871e-05, + "loss": 0.1661, + "step": 12731 + }, + { + "epoch": 0.203712, + "grad_norm": 0.6171875, + "learning_rate": 8.02725806451613e-05, + "loss": 0.1434, + "step": 12732 + }, + { + "epoch": 0.203728, + "grad_norm": 0.6015625, + "learning_rate": 8.027096774193549e-05, + "loss": 0.1419, + "step": 12733 + }, + { + "epoch": 0.203744, + "grad_norm": 0.5859375, + "learning_rate": 8.026935483870967e-05, + "loss": 0.165, + "step": 12734 + }, + { + "epoch": 0.20376, + "grad_norm": 0.57421875, + "learning_rate": 8.026774193548387e-05, + "loss": 0.1448, + "step": 12735 + }, + { + "epoch": 0.203776, + "grad_norm": 0.609375, + "learning_rate": 8.026612903225807e-05, + "loss": 0.1686, + "step": 12736 + }, + { + "epoch": 0.203792, + "grad_norm": 0.70703125, + "learning_rate": 8.026451612903226e-05, + "loss": 0.1521, + "step": 12737 + }, + { + "epoch": 0.203808, + "grad_norm": 0.5859375, + "learning_rate": 8.026290322580646e-05, + "loss": 0.1572, + "step": 12738 + }, + { + "epoch": 0.203824, + "grad_norm": 0.65625, + "learning_rate": 8.026129032258066e-05, + "loss": 0.1154, + "step": 12739 + }, + { + "epoch": 0.20384, + "grad_norm": 0.74609375, + "learning_rate": 8.025967741935484e-05, + "loss": 0.1964, + "step": 12740 + }, + { + "epoch": 0.203856, + "grad_norm": 0.875, + "learning_rate": 8.025806451612904e-05, + "loss": 0.174, + "step": 12741 + }, + { + "epoch": 0.203872, + "grad_norm": 0.55859375, + "learning_rate": 8.025645161290323e-05, + "loss": 0.1504, + "step": 12742 + }, + { + "epoch": 0.203888, + "grad_norm": 0.68359375, + "learning_rate": 8.025483870967743e-05, + "loss": 0.1446, + "step": 12743 + }, + { + "epoch": 0.203904, + "grad_norm": 0.9765625, + "learning_rate": 8.025322580645161e-05, + "loss": 0.1658, + "step": 12744 + }, + { + "epoch": 0.20392, + "grad_norm": 0.64453125, + "learning_rate": 8.025161290322581e-05, + "loss": 0.1726, + "step": 12745 + }, + { + "epoch": 0.203936, + "grad_norm": 1.0625, + "learning_rate": 8.025e-05, + "loss": 0.1699, + "step": 12746 + }, + { + "epoch": 0.203952, + "grad_norm": 0.71875, + "learning_rate": 8.02483870967742e-05, + "loss": 0.1826, + "step": 12747 + }, + { + "epoch": 0.203968, + "grad_norm": 0.81640625, + "learning_rate": 8.024677419354838e-05, + "loss": 0.1864, + "step": 12748 + }, + { + "epoch": 0.203984, + "grad_norm": 0.77734375, + "learning_rate": 8.024516129032258e-05, + "loss": 0.1721, + "step": 12749 + }, + { + "epoch": 0.204, + "grad_norm": 0.765625, + "learning_rate": 8.024354838709678e-05, + "loss": 0.1465, + "step": 12750 + }, + { + "epoch": 0.204016, + "grad_norm": 0.76171875, + "learning_rate": 8.024193548387097e-05, + "loss": 0.1429, + "step": 12751 + }, + { + "epoch": 0.204032, + "grad_norm": 0.875, + "learning_rate": 8.024032258064517e-05, + "loss": 0.1648, + "step": 12752 + }, + { + "epoch": 0.204048, + "grad_norm": 1.3359375, + "learning_rate": 8.023870967741936e-05, + "loss": 0.1822, + "step": 12753 + }, + { + "epoch": 0.204064, + "grad_norm": 0.62890625, + "learning_rate": 8.023709677419356e-05, + "loss": 0.1632, + "step": 12754 + }, + { + "epoch": 0.20408, + "grad_norm": 0.76953125, + "learning_rate": 8.023548387096774e-05, + "loss": 0.1676, + "step": 12755 + }, + { + "epoch": 0.204096, + "grad_norm": 0.6484375, + "learning_rate": 8.023387096774194e-05, + "loss": 0.1637, + "step": 12756 + }, + { + "epoch": 0.204112, + "grad_norm": 0.78515625, + "learning_rate": 8.023225806451613e-05, + "loss": 0.2016, + "step": 12757 + }, + { + "epoch": 0.204128, + "grad_norm": 1.1015625, + "learning_rate": 8.023064516129033e-05, + "loss": 0.2055, + "step": 12758 + }, + { + "epoch": 0.204144, + "grad_norm": 0.91796875, + "learning_rate": 8.022903225806451e-05, + "loss": 0.1437, + "step": 12759 + }, + { + "epoch": 0.20416, + "grad_norm": 0.734375, + "learning_rate": 8.022741935483871e-05, + "loss": 0.1578, + "step": 12760 + }, + { + "epoch": 0.204176, + "grad_norm": 0.6953125, + "learning_rate": 8.022580645161291e-05, + "loss": 0.1961, + "step": 12761 + }, + { + "epoch": 0.204192, + "grad_norm": 0.98046875, + "learning_rate": 8.022419354838711e-05, + "loss": 0.1177, + "step": 12762 + }, + { + "epoch": 0.204208, + "grad_norm": 1.140625, + "learning_rate": 8.02225806451613e-05, + "loss": 0.2154, + "step": 12763 + }, + { + "epoch": 0.204224, + "grad_norm": 1.0546875, + "learning_rate": 8.02209677419355e-05, + "loss": 0.2099, + "step": 12764 + }, + { + "epoch": 0.20424, + "grad_norm": 1.3046875, + "learning_rate": 8.021935483870968e-05, + "loss": 0.1886, + "step": 12765 + }, + { + "epoch": 0.204256, + "grad_norm": 0.69921875, + "learning_rate": 8.021774193548387e-05, + "loss": 0.187, + "step": 12766 + }, + { + "epoch": 0.204272, + "grad_norm": 0.765625, + "learning_rate": 8.021612903225807e-05, + "loss": 0.145, + "step": 12767 + }, + { + "epoch": 0.204288, + "grad_norm": 1.359375, + "learning_rate": 8.021451612903226e-05, + "loss": 0.2227, + "step": 12768 + }, + { + "epoch": 0.204304, + "grad_norm": 0.7578125, + "learning_rate": 8.021290322580646e-05, + "loss": 0.1529, + "step": 12769 + }, + { + "epoch": 0.20432, + "grad_norm": 0.77734375, + "learning_rate": 8.021129032258064e-05, + "loss": 0.2326, + "step": 12770 + }, + { + "epoch": 0.204336, + "grad_norm": 0.9375, + "learning_rate": 8.020967741935484e-05, + "loss": 0.2202, + "step": 12771 + }, + { + "epoch": 0.204352, + "grad_norm": 1.296875, + "learning_rate": 8.020806451612903e-05, + "loss": 0.173, + "step": 12772 + }, + { + "epoch": 0.204368, + "grad_norm": 0.546875, + "learning_rate": 8.020645161290323e-05, + "loss": 0.1749, + "step": 12773 + }, + { + "epoch": 0.204384, + "grad_norm": 1.125, + "learning_rate": 8.020483870967743e-05, + "loss": 0.1595, + "step": 12774 + }, + { + "epoch": 0.2044, + "grad_norm": 0.828125, + "learning_rate": 8.020322580645163e-05, + "loss": 0.1838, + "step": 12775 + }, + { + "epoch": 0.204416, + "grad_norm": 0.7734375, + "learning_rate": 8.020161290322581e-05, + "loss": 0.1759, + "step": 12776 + }, + { + "epoch": 0.204432, + "grad_norm": 0.69921875, + "learning_rate": 8.020000000000001e-05, + "loss": 0.1805, + "step": 12777 + }, + { + "epoch": 0.204448, + "grad_norm": 0.5625, + "learning_rate": 8.01983870967742e-05, + "loss": 0.1877, + "step": 12778 + }, + { + "epoch": 0.204464, + "grad_norm": 0.625, + "learning_rate": 8.01967741935484e-05, + "loss": 0.1705, + "step": 12779 + }, + { + "epoch": 0.20448, + "grad_norm": 0.67578125, + "learning_rate": 8.019516129032258e-05, + "loss": 0.1701, + "step": 12780 + }, + { + "epoch": 0.204496, + "grad_norm": 0.50390625, + "learning_rate": 8.019354838709677e-05, + "loss": 0.1625, + "step": 12781 + }, + { + "epoch": 0.204512, + "grad_norm": 1.046875, + "learning_rate": 8.019193548387097e-05, + "loss": 0.1849, + "step": 12782 + }, + { + "epoch": 0.204528, + "grad_norm": 0.84765625, + "learning_rate": 8.019032258064515e-05, + "loss": 0.1491, + "step": 12783 + }, + { + "epoch": 0.204544, + "grad_norm": 0.77734375, + "learning_rate": 8.018870967741935e-05, + "loss": 0.145, + "step": 12784 + }, + { + "epoch": 0.20456, + "grad_norm": 1.0625, + "learning_rate": 8.018709677419355e-05, + "loss": 0.2006, + "step": 12785 + }, + { + "epoch": 0.204576, + "grad_norm": 0.80078125, + "learning_rate": 8.018548387096775e-05, + "loss": 0.216, + "step": 12786 + }, + { + "epoch": 0.204592, + "grad_norm": 1.8125, + "learning_rate": 8.018387096774194e-05, + "loss": 0.1993, + "step": 12787 + }, + { + "epoch": 0.204608, + "grad_norm": 2.21875, + "learning_rate": 8.018225806451614e-05, + "loss": 0.198, + "step": 12788 + }, + { + "epoch": 0.204624, + "grad_norm": 0.58984375, + "learning_rate": 8.018064516129033e-05, + "loss": 0.1512, + "step": 12789 + }, + { + "epoch": 0.20464, + "grad_norm": 0.87109375, + "learning_rate": 8.017903225806453e-05, + "loss": 0.1415, + "step": 12790 + }, + { + "epoch": 0.204656, + "grad_norm": 0.71875, + "learning_rate": 8.017741935483871e-05, + "loss": 0.1829, + "step": 12791 + }, + { + "epoch": 0.204672, + "grad_norm": 1.3046875, + "learning_rate": 8.017580645161291e-05, + "loss": 0.1797, + "step": 12792 + }, + { + "epoch": 0.204688, + "grad_norm": 0.94140625, + "learning_rate": 8.01741935483871e-05, + "loss": 0.2679, + "step": 12793 + }, + { + "epoch": 0.204704, + "grad_norm": 0.640625, + "learning_rate": 8.01725806451613e-05, + "loss": 0.1528, + "step": 12794 + }, + { + "epoch": 0.20472, + "grad_norm": 0.69921875, + "learning_rate": 8.017096774193548e-05, + "loss": 0.166, + "step": 12795 + }, + { + "epoch": 0.204736, + "grad_norm": 0.796875, + "learning_rate": 8.016935483870968e-05, + "loss": 0.2063, + "step": 12796 + }, + { + "epoch": 0.204752, + "grad_norm": 0.9140625, + "learning_rate": 8.016774193548388e-05, + "loss": 0.194, + "step": 12797 + }, + { + "epoch": 0.204768, + "grad_norm": 0.890625, + "learning_rate": 8.016612903225807e-05, + "loss": 0.2042, + "step": 12798 + }, + { + "epoch": 0.204784, + "grad_norm": 0.70703125, + "learning_rate": 8.016451612903227e-05, + "loss": 0.1551, + "step": 12799 + }, + { + "epoch": 0.2048, + "grad_norm": 0.671875, + "learning_rate": 8.016290322580645e-05, + "loss": 0.1683, + "step": 12800 + }, + { + "epoch": 0.204816, + "grad_norm": 0.6328125, + "learning_rate": 8.016129032258065e-05, + "loss": 0.1867, + "step": 12801 + }, + { + "epoch": 0.204832, + "grad_norm": 0.7578125, + "learning_rate": 8.015967741935484e-05, + "loss": 0.1973, + "step": 12802 + }, + { + "epoch": 0.204848, + "grad_norm": 0.7578125, + "learning_rate": 8.015806451612904e-05, + "loss": 0.1457, + "step": 12803 + }, + { + "epoch": 0.204864, + "grad_norm": 0.60546875, + "learning_rate": 8.015645161290323e-05, + "loss": 0.1833, + "step": 12804 + }, + { + "epoch": 0.20488, + "grad_norm": 0.5859375, + "learning_rate": 8.015483870967742e-05, + "loss": 0.1661, + "step": 12805 + }, + { + "epoch": 0.204896, + "grad_norm": 0.52734375, + "learning_rate": 8.015322580645161e-05, + "loss": 0.1617, + "step": 12806 + }, + { + "epoch": 0.204912, + "grad_norm": 0.86328125, + "learning_rate": 8.015161290322581e-05, + "loss": 0.2176, + "step": 12807 + }, + { + "epoch": 0.204928, + "grad_norm": 0.6953125, + "learning_rate": 8.015e-05, + "loss": 0.1848, + "step": 12808 + }, + { + "epoch": 0.204944, + "grad_norm": 0.7890625, + "learning_rate": 8.01483870967742e-05, + "loss": 0.2047, + "step": 12809 + }, + { + "epoch": 0.20496, + "grad_norm": 0.63671875, + "learning_rate": 8.01467741935484e-05, + "loss": 0.1873, + "step": 12810 + }, + { + "epoch": 0.204976, + "grad_norm": 0.90625, + "learning_rate": 8.014516129032258e-05, + "loss": 0.221, + "step": 12811 + }, + { + "epoch": 0.204992, + "grad_norm": 0.890625, + "learning_rate": 8.014354838709678e-05, + "loss": 0.1996, + "step": 12812 + }, + { + "epoch": 0.205008, + "grad_norm": 0.8984375, + "learning_rate": 8.014193548387097e-05, + "loss": 0.1875, + "step": 12813 + }, + { + "epoch": 0.205024, + "grad_norm": 0.6484375, + "learning_rate": 8.014032258064517e-05, + "loss": 0.142, + "step": 12814 + }, + { + "epoch": 0.20504, + "grad_norm": 0.75390625, + "learning_rate": 8.013870967741935e-05, + "loss": 0.1473, + "step": 12815 + }, + { + "epoch": 0.205056, + "grad_norm": 0.51171875, + "learning_rate": 8.013709677419355e-05, + "loss": 0.1455, + "step": 12816 + }, + { + "epoch": 0.205072, + "grad_norm": 0.671875, + "learning_rate": 8.013548387096774e-05, + "loss": 0.1046, + "step": 12817 + }, + { + "epoch": 0.205088, + "grad_norm": 0.6953125, + "learning_rate": 8.013387096774194e-05, + "loss": 0.1948, + "step": 12818 + }, + { + "epoch": 0.205104, + "grad_norm": 0.6875, + "learning_rate": 8.013225806451612e-05, + "loss": 0.1553, + "step": 12819 + }, + { + "epoch": 0.20512, + "grad_norm": 0.62109375, + "learning_rate": 8.013064516129032e-05, + "loss": 0.1366, + "step": 12820 + }, + { + "epoch": 0.205136, + "grad_norm": 0.75390625, + "learning_rate": 8.012903225806452e-05, + "loss": 0.1841, + "step": 12821 + }, + { + "epoch": 0.205152, + "grad_norm": 0.609375, + "learning_rate": 8.012741935483872e-05, + "loss": 0.1651, + "step": 12822 + }, + { + "epoch": 0.205168, + "grad_norm": 0.97265625, + "learning_rate": 8.012580645161291e-05, + "loss": 0.1756, + "step": 12823 + }, + { + "epoch": 0.205184, + "grad_norm": 0.95703125, + "learning_rate": 8.012419354838711e-05, + "loss": 0.1473, + "step": 12824 + }, + { + "epoch": 0.2052, + "grad_norm": 0.71875, + "learning_rate": 8.01225806451613e-05, + "loss": 0.1861, + "step": 12825 + }, + { + "epoch": 0.205216, + "grad_norm": 0.46484375, + "learning_rate": 8.01209677419355e-05, + "loss": 0.1236, + "step": 12826 + }, + { + "epoch": 0.205232, + "grad_norm": 1.1796875, + "learning_rate": 8.011935483870968e-05, + "loss": 0.2089, + "step": 12827 + }, + { + "epoch": 0.205248, + "grad_norm": 1.0, + "learning_rate": 8.011774193548387e-05, + "loss": 0.1782, + "step": 12828 + }, + { + "epoch": 0.205264, + "grad_norm": 0.87109375, + "learning_rate": 8.011612903225807e-05, + "loss": 0.183, + "step": 12829 + }, + { + "epoch": 0.20528, + "grad_norm": 0.85546875, + "learning_rate": 8.011451612903225e-05, + "loss": 0.2111, + "step": 12830 + }, + { + "epoch": 0.205296, + "grad_norm": 0.578125, + "learning_rate": 8.011290322580645e-05, + "loss": 0.1411, + "step": 12831 + }, + { + "epoch": 0.205312, + "grad_norm": 0.859375, + "learning_rate": 8.011129032258065e-05, + "loss": 0.1761, + "step": 12832 + }, + { + "epoch": 0.205328, + "grad_norm": 0.80078125, + "learning_rate": 8.010967741935484e-05, + "loss": 0.1836, + "step": 12833 + }, + { + "epoch": 0.205344, + "grad_norm": 0.734375, + "learning_rate": 8.010806451612904e-05, + "loss": 0.1379, + "step": 12834 + }, + { + "epoch": 0.20536, + "grad_norm": 1.109375, + "learning_rate": 8.010645161290324e-05, + "loss": 0.1538, + "step": 12835 + }, + { + "epoch": 0.205376, + "grad_norm": 1.2421875, + "learning_rate": 8.010483870967742e-05, + "loss": 0.1742, + "step": 12836 + }, + { + "epoch": 0.205392, + "grad_norm": 1.0703125, + "learning_rate": 8.010322580645162e-05, + "loss": 0.1936, + "step": 12837 + }, + { + "epoch": 0.205408, + "grad_norm": 0.75390625, + "learning_rate": 8.010161290322581e-05, + "loss": 0.1738, + "step": 12838 + }, + { + "epoch": 0.205424, + "grad_norm": 0.7578125, + "learning_rate": 8.010000000000001e-05, + "loss": 0.1701, + "step": 12839 + }, + { + "epoch": 0.20544, + "grad_norm": 0.85546875, + "learning_rate": 8.00983870967742e-05, + "loss": 0.1551, + "step": 12840 + }, + { + "epoch": 0.205456, + "grad_norm": 0.83203125, + "learning_rate": 8.00967741935484e-05, + "loss": 0.1968, + "step": 12841 + }, + { + "epoch": 0.205472, + "grad_norm": 0.69921875, + "learning_rate": 8.009516129032258e-05, + "loss": 0.2098, + "step": 12842 + }, + { + "epoch": 0.205488, + "grad_norm": 0.95703125, + "learning_rate": 8.009354838709677e-05, + "loss": 0.1911, + "step": 12843 + }, + { + "epoch": 0.205504, + "grad_norm": 0.90234375, + "learning_rate": 8.009193548387097e-05, + "loss": 0.1885, + "step": 12844 + }, + { + "epoch": 0.20552, + "grad_norm": 0.68359375, + "learning_rate": 8.009032258064517e-05, + "loss": 0.1485, + "step": 12845 + }, + { + "epoch": 0.205536, + "grad_norm": 0.7265625, + "learning_rate": 8.008870967741937e-05, + "loss": 0.1287, + "step": 12846 + }, + { + "epoch": 0.205552, + "grad_norm": 0.7578125, + "learning_rate": 8.008709677419355e-05, + "loss": 0.1509, + "step": 12847 + }, + { + "epoch": 0.205568, + "grad_norm": 0.72265625, + "learning_rate": 8.008548387096775e-05, + "loss": 0.1651, + "step": 12848 + }, + { + "epoch": 0.205584, + "grad_norm": 0.58984375, + "learning_rate": 8.008387096774194e-05, + "loss": 0.1702, + "step": 12849 + }, + { + "epoch": 0.2056, + "grad_norm": 1.0234375, + "learning_rate": 8.008225806451614e-05, + "loss": 0.1706, + "step": 12850 + }, + { + "epoch": 0.205616, + "grad_norm": 0.91015625, + "learning_rate": 8.008064516129032e-05, + "loss": 0.1763, + "step": 12851 + }, + { + "epoch": 0.205632, + "grad_norm": 0.8359375, + "learning_rate": 8.007903225806452e-05, + "loss": 0.1803, + "step": 12852 + }, + { + "epoch": 0.205648, + "grad_norm": 0.61328125, + "learning_rate": 8.007741935483871e-05, + "loss": 0.157, + "step": 12853 + }, + { + "epoch": 0.205664, + "grad_norm": 0.96484375, + "learning_rate": 8.007580645161291e-05, + "loss": 0.1718, + "step": 12854 + }, + { + "epoch": 0.20568, + "grad_norm": 0.9375, + "learning_rate": 8.00741935483871e-05, + "loss": 0.1851, + "step": 12855 + }, + { + "epoch": 0.205696, + "grad_norm": 0.59375, + "learning_rate": 8.00725806451613e-05, + "loss": 0.1995, + "step": 12856 + }, + { + "epoch": 0.205712, + "grad_norm": 0.61328125, + "learning_rate": 8.00709677419355e-05, + "loss": 0.1747, + "step": 12857 + }, + { + "epoch": 0.205728, + "grad_norm": 0.9921875, + "learning_rate": 8.006935483870968e-05, + "loss": 0.1629, + "step": 12858 + }, + { + "epoch": 0.205744, + "grad_norm": 0.69140625, + "learning_rate": 8.006774193548388e-05, + "loss": 0.1857, + "step": 12859 + }, + { + "epoch": 0.20576, + "grad_norm": 0.6484375, + "learning_rate": 8.006612903225807e-05, + "loss": 0.1876, + "step": 12860 + }, + { + "epoch": 0.205776, + "grad_norm": 0.625, + "learning_rate": 8.006451612903227e-05, + "loss": 0.1799, + "step": 12861 + }, + { + "epoch": 0.205792, + "grad_norm": 0.89453125, + "learning_rate": 8.006290322580645e-05, + "loss": 0.1846, + "step": 12862 + }, + { + "epoch": 0.205808, + "grad_norm": 1.140625, + "learning_rate": 8.006129032258065e-05, + "loss": 0.2069, + "step": 12863 + }, + { + "epoch": 0.205824, + "grad_norm": 0.96875, + "learning_rate": 8.005967741935484e-05, + "loss": 0.1562, + "step": 12864 + }, + { + "epoch": 0.20584, + "grad_norm": 0.81640625, + "learning_rate": 8.005806451612904e-05, + "loss": 0.2164, + "step": 12865 + }, + { + "epoch": 0.205856, + "grad_norm": 1.0, + "learning_rate": 8.005645161290322e-05, + "loss": 0.1719, + "step": 12866 + }, + { + "epoch": 0.205872, + "grad_norm": 1.0234375, + "learning_rate": 8.005483870967742e-05, + "loss": 0.1645, + "step": 12867 + }, + { + "epoch": 0.205888, + "grad_norm": 0.6875, + "learning_rate": 8.005322580645161e-05, + "loss": 0.152, + "step": 12868 + }, + { + "epoch": 0.205904, + "grad_norm": 1.0390625, + "learning_rate": 8.005161290322581e-05, + "loss": 0.2371, + "step": 12869 + }, + { + "epoch": 0.20592, + "grad_norm": 0.80859375, + "learning_rate": 8.005000000000001e-05, + "loss": 0.1333, + "step": 12870 + }, + { + "epoch": 0.205936, + "grad_norm": 0.7421875, + "learning_rate": 8.004838709677421e-05, + "loss": 0.1417, + "step": 12871 + }, + { + "epoch": 0.205952, + "grad_norm": 1.0703125, + "learning_rate": 8.00467741935484e-05, + "loss": 0.1728, + "step": 12872 + }, + { + "epoch": 0.205968, + "grad_norm": 0.76953125, + "learning_rate": 8.004516129032258e-05, + "loss": 0.1585, + "step": 12873 + }, + { + "epoch": 0.205984, + "grad_norm": 0.90234375, + "learning_rate": 8.004354838709678e-05, + "loss": 0.1723, + "step": 12874 + }, + { + "epoch": 0.206, + "grad_norm": 1.078125, + "learning_rate": 8.004193548387097e-05, + "loss": 0.1797, + "step": 12875 + }, + { + "epoch": 0.206016, + "grad_norm": 0.92578125, + "learning_rate": 8.004032258064516e-05, + "loss": 0.1648, + "step": 12876 + }, + { + "epoch": 0.206032, + "grad_norm": 0.984375, + "learning_rate": 8.003870967741935e-05, + "loss": 0.1446, + "step": 12877 + }, + { + "epoch": 0.206048, + "grad_norm": 0.91015625, + "learning_rate": 8.003709677419355e-05, + "loss": 0.1403, + "step": 12878 + }, + { + "epoch": 0.206064, + "grad_norm": 1.03125, + "learning_rate": 8.003548387096774e-05, + "loss": 0.1489, + "step": 12879 + }, + { + "epoch": 0.20608, + "grad_norm": 0.53515625, + "learning_rate": 8.003387096774194e-05, + "loss": 0.1275, + "step": 12880 + }, + { + "epoch": 0.206096, + "grad_norm": 0.49609375, + "learning_rate": 8.003225806451614e-05, + "loss": 0.1253, + "step": 12881 + }, + { + "epoch": 0.206112, + "grad_norm": 0.734375, + "learning_rate": 8.003064516129034e-05, + "loss": 0.1392, + "step": 12882 + }, + { + "epoch": 0.206128, + "grad_norm": 0.6796875, + "learning_rate": 8.002903225806452e-05, + "loss": 0.1597, + "step": 12883 + }, + { + "epoch": 0.206144, + "grad_norm": 0.482421875, + "learning_rate": 8.002741935483872e-05, + "loss": 0.12, + "step": 12884 + }, + { + "epoch": 0.20616, + "grad_norm": 0.71875, + "learning_rate": 8.002580645161291e-05, + "loss": 0.1748, + "step": 12885 + }, + { + "epoch": 0.206176, + "grad_norm": 0.74609375, + "learning_rate": 8.002419354838711e-05, + "loss": 0.181, + "step": 12886 + }, + { + "epoch": 0.206192, + "grad_norm": 0.640625, + "learning_rate": 8.002258064516129e-05, + "loss": 0.1383, + "step": 12887 + }, + { + "epoch": 0.206208, + "grad_norm": 0.83203125, + "learning_rate": 8.002096774193549e-05, + "loss": 0.116, + "step": 12888 + }, + { + "epoch": 0.206224, + "grad_norm": 0.6796875, + "learning_rate": 8.001935483870968e-05, + "loss": 0.1461, + "step": 12889 + }, + { + "epoch": 0.20624, + "grad_norm": 0.75, + "learning_rate": 8.001774193548386e-05, + "loss": 0.177, + "step": 12890 + }, + { + "epoch": 0.206256, + "grad_norm": 0.92578125, + "learning_rate": 8.001612903225806e-05, + "loss": 0.135, + "step": 12891 + }, + { + "epoch": 0.206272, + "grad_norm": 0.84375, + "learning_rate": 8.001451612903226e-05, + "loss": 0.1851, + "step": 12892 + }, + { + "epoch": 0.206288, + "grad_norm": 0.85546875, + "learning_rate": 8.001290322580646e-05, + "loss": 0.154, + "step": 12893 + }, + { + "epoch": 0.206304, + "grad_norm": 0.6796875, + "learning_rate": 8.001129032258065e-05, + "loss": 0.1645, + "step": 12894 + }, + { + "epoch": 0.20632, + "grad_norm": 0.7265625, + "learning_rate": 8.000967741935485e-05, + "loss": 0.142, + "step": 12895 + }, + { + "epoch": 0.206336, + "grad_norm": 0.71875, + "learning_rate": 8.000806451612904e-05, + "loss": 0.1947, + "step": 12896 + }, + { + "epoch": 0.206352, + "grad_norm": 0.6171875, + "learning_rate": 8.000645161290324e-05, + "loss": 0.1467, + "step": 12897 + }, + { + "epoch": 0.206368, + "grad_norm": 0.9296875, + "learning_rate": 8.000483870967742e-05, + "loss": 0.2129, + "step": 12898 + }, + { + "epoch": 0.206384, + "grad_norm": 0.66015625, + "learning_rate": 8.000322580645162e-05, + "loss": 0.1632, + "step": 12899 + }, + { + "epoch": 0.2064, + "grad_norm": 1.5, + "learning_rate": 8.000161290322581e-05, + "loss": 0.2008, + "step": 12900 + }, + { + "epoch": 0.206416, + "grad_norm": 1.6171875, + "learning_rate": 8e-05, + "loss": 0.2057, + "step": 12901 + }, + { + "epoch": 0.206432, + "grad_norm": 0.734375, + "learning_rate": 7.999838709677419e-05, + "loss": 0.1648, + "step": 12902 + }, + { + "epoch": 0.206448, + "grad_norm": 0.76171875, + "learning_rate": 7.999677419354839e-05, + "loss": 0.2077, + "step": 12903 + }, + { + "epoch": 0.206464, + "grad_norm": 0.51953125, + "learning_rate": 7.999516129032258e-05, + "loss": 0.1449, + "step": 12904 + }, + { + "epoch": 0.20648, + "grad_norm": 0.8984375, + "learning_rate": 7.999354838709678e-05, + "loss": 0.1579, + "step": 12905 + }, + { + "epoch": 0.206496, + "grad_norm": 0.73828125, + "learning_rate": 7.999193548387098e-05, + "loss": 0.1627, + "step": 12906 + }, + { + "epoch": 0.206512, + "grad_norm": 0.96484375, + "learning_rate": 7.999032258064516e-05, + "loss": 0.18, + "step": 12907 + }, + { + "epoch": 0.206528, + "grad_norm": 0.8125, + "learning_rate": 7.998870967741936e-05, + "loss": 0.149, + "step": 12908 + }, + { + "epoch": 0.206544, + "grad_norm": 0.67578125, + "learning_rate": 7.998709677419355e-05, + "loss": 0.1618, + "step": 12909 + }, + { + "epoch": 0.20656, + "grad_norm": 0.66796875, + "learning_rate": 7.998548387096775e-05, + "loss": 0.155, + "step": 12910 + }, + { + "epoch": 0.206576, + "grad_norm": 0.94140625, + "learning_rate": 7.998387096774194e-05, + "loss": 0.1875, + "step": 12911 + }, + { + "epoch": 0.206592, + "grad_norm": 1.03125, + "learning_rate": 7.998225806451613e-05, + "loss": 0.1857, + "step": 12912 + }, + { + "epoch": 0.206608, + "grad_norm": 1.3671875, + "learning_rate": 7.998064516129032e-05, + "loss": 0.2197, + "step": 12913 + }, + { + "epoch": 0.206624, + "grad_norm": 0.7421875, + "learning_rate": 7.997903225806452e-05, + "loss": 0.194, + "step": 12914 + }, + { + "epoch": 0.20664, + "grad_norm": 0.65234375, + "learning_rate": 7.99774193548387e-05, + "loss": 0.1468, + "step": 12915 + }, + { + "epoch": 0.206656, + "grad_norm": 0.90234375, + "learning_rate": 7.99758064516129e-05, + "loss": 0.1348, + "step": 12916 + }, + { + "epoch": 0.206672, + "grad_norm": 0.8984375, + "learning_rate": 7.99741935483871e-05, + "loss": 0.1701, + "step": 12917 + }, + { + "epoch": 0.206688, + "grad_norm": 0.73046875, + "learning_rate": 7.99725806451613e-05, + "loss": 0.1612, + "step": 12918 + }, + { + "epoch": 0.206704, + "grad_norm": 1.4453125, + "learning_rate": 7.997096774193549e-05, + "loss": 0.1735, + "step": 12919 + }, + { + "epoch": 0.20672, + "grad_norm": 0.5703125, + "learning_rate": 7.996935483870968e-05, + "loss": 0.144, + "step": 12920 + }, + { + "epoch": 0.206736, + "grad_norm": 0.56640625, + "learning_rate": 7.996774193548388e-05, + "loss": 0.1471, + "step": 12921 + }, + { + "epoch": 0.206752, + "grad_norm": 0.90625, + "learning_rate": 7.996612903225806e-05, + "loss": 0.1852, + "step": 12922 + }, + { + "epoch": 0.206768, + "grad_norm": 0.7265625, + "learning_rate": 7.996451612903226e-05, + "loss": 0.1726, + "step": 12923 + }, + { + "epoch": 0.206784, + "grad_norm": 1.078125, + "learning_rate": 7.996290322580645e-05, + "loss": 0.2093, + "step": 12924 + }, + { + "epoch": 0.2068, + "grad_norm": 0.94140625, + "learning_rate": 7.996129032258065e-05, + "loss": 0.1849, + "step": 12925 + }, + { + "epoch": 0.206816, + "grad_norm": 0.80078125, + "learning_rate": 7.995967741935483e-05, + "loss": 0.1944, + "step": 12926 + }, + { + "epoch": 0.206832, + "grad_norm": 0.64453125, + "learning_rate": 7.995806451612903e-05, + "loss": 0.1761, + "step": 12927 + }, + { + "epoch": 0.206848, + "grad_norm": 0.7734375, + "learning_rate": 7.995645161290322e-05, + "loss": 0.1816, + "step": 12928 + }, + { + "epoch": 0.206864, + "grad_norm": 0.703125, + "learning_rate": 7.995483870967742e-05, + "loss": 0.1657, + "step": 12929 + }, + { + "epoch": 0.20688, + "grad_norm": 1.0703125, + "learning_rate": 7.995322580645162e-05, + "loss": 0.2071, + "step": 12930 + }, + { + "epoch": 0.206896, + "grad_norm": 0.859375, + "learning_rate": 7.995161290322582e-05, + "loss": 0.16, + "step": 12931 + }, + { + "epoch": 0.206912, + "grad_norm": 1.03125, + "learning_rate": 7.995e-05, + "loss": 0.1838, + "step": 12932 + }, + { + "epoch": 0.206928, + "grad_norm": 1.53125, + "learning_rate": 7.99483870967742e-05, + "loss": 0.2112, + "step": 12933 + }, + { + "epoch": 0.206944, + "grad_norm": 0.734375, + "learning_rate": 7.994677419354839e-05, + "loss": 0.1597, + "step": 12934 + }, + { + "epoch": 0.20696, + "grad_norm": 0.67578125, + "learning_rate": 7.994516129032259e-05, + "loss": 0.1761, + "step": 12935 + }, + { + "epoch": 0.206976, + "grad_norm": 1.0546875, + "learning_rate": 7.994354838709678e-05, + "loss": 0.1768, + "step": 12936 + }, + { + "epoch": 0.206992, + "grad_norm": 0.953125, + "learning_rate": 7.994193548387096e-05, + "loss": 0.1846, + "step": 12937 + }, + { + "epoch": 0.207008, + "grad_norm": 0.6796875, + "learning_rate": 7.994032258064516e-05, + "loss": 0.1564, + "step": 12938 + }, + { + "epoch": 0.207024, + "grad_norm": 1.515625, + "learning_rate": 7.993870967741935e-05, + "loss": 0.1769, + "step": 12939 + }, + { + "epoch": 0.20704, + "grad_norm": 0.54296875, + "learning_rate": 7.993709677419355e-05, + "loss": 0.1749, + "step": 12940 + }, + { + "epoch": 0.207056, + "grad_norm": 0.890625, + "learning_rate": 7.993548387096775e-05, + "loss": 0.2006, + "step": 12941 + }, + { + "epoch": 0.207072, + "grad_norm": 0.828125, + "learning_rate": 7.993387096774195e-05, + "loss": 0.1855, + "step": 12942 + }, + { + "epoch": 0.207088, + "grad_norm": 0.71875, + "learning_rate": 7.993225806451613e-05, + "loss": 0.1701, + "step": 12943 + }, + { + "epoch": 0.207104, + "grad_norm": 0.640625, + "learning_rate": 7.993064516129033e-05, + "loss": 0.2015, + "step": 12944 + }, + { + "epoch": 0.20712, + "grad_norm": 0.80859375, + "learning_rate": 7.992903225806452e-05, + "loss": 0.1762, + "step": 12945 + }, + { + "epoch": 0.207136, + "grad_norm": 0.76171875, + "learning_rate": 7.992741935483872e-05, + "loss": 0.1836, + "step": 12946 + }, + { + "epoch": 0.207152, + "grad_norm": 0.84375, + "learning_rate": 7.99258064516129e-05, + "loss": 0.1564, + "step": 12947 + }, + { + "epoch": 0.207168, + "grad_norm": 0.90234375, + "learning_rate": 7.99241935483871e-05, + "loss": 0.1738, + "step": 12948 + }, + { + "epoch": 0.207184, + "grad_norm": 1.1171875, + "learning_rate": 7.992258064516129e-05, + "loss": 0.2304, + "step": 12949 + }, + { + "epoch": 0.2072, + "grad_norm": 1.0859375, + "learning_rate": 7.992096774193549e-05, + "loss": 0.1941, + "step": 12950 + }, + { + "epoch": 0.207216, + "grad_norm": 0.87109375, + "learning_rate": 7.991935483870968e-05, + "loss": 0.1845, + "step": 12951 + }, + { + "epoch": 0.207232, + "grad_norm": 0.73828125, + "learning_rate": 7.991774193548388e-05, + "loss": 0.2174, + "step": 12952 + }, + { + "epoch": 0.207248, + "grad_norm": 0.85546875, + "learning_rate": 7.991612903225808e-05, + "loss": 0.1431, + "step": 12953 + }, + { + "epoch": 0.207264, + "grad_norm": 0.87109375, + "learning_rate": 7.991451612903226e-05, + "loss": 0.1955, + "step": 12954 + }, + { + "epoch": 0.20728, + "grad_norm": 0.84765625, + "learning_rate": 7.991290322580646e-05, + "loss": 0.1576, + "step": 12955 + }, + { + "epoch": 0.207296, + "grad_norm": 0.8359375, + "learning_rate": 7.991129032258065e-05, + "loss": 0.1531, + "step": 12956 + }, + { + "epoch": 0.207312, + "grad_norm": 0.96484375, + "learning_rate": 7.990967741935485e-05, + "loss": 0.1768, + "step": 12957 + }, + { + "epoch": 0.207328, + "grad_norm": 0.9453125, + "learning_rate": 7.990806451612903e-05, + "loss": 0.2144, + "step": 12958 + }, + { + "epoch": 0.207344, + "grad_norm": 0.8203125, + "learning_rate": 7.990645161290323e-05, + "loss": 0.1666, + "step": 12959 + }, + { + "epoch": 0.20736, + "grad_norm": 1.28125, + "learning_rate": 7.990483870967742e-05, + "loss": 0.1785, + "step": 12960 + }, + { + "epoch": 0.207376, + "grad_norm": 0.70703125, + "learning_rate": 7.990322580645162e-05, + "loss": 0.175, + "step": 12961 + }, + { + "epoch": 0.207392, + "grad_norm": 0.72265625, + "learning_rate": 7.99016129032258e-05, + "loss": 0.19, + "step": 12962 + }, + { + "epoch": 0.207408, + "grad_norm": 0.53125, + "learning_rate": 7.99e-05, + "loss": 0.1331, + "step": 12963 + }, + { + "epoch": 0.207424, + "grad_norm": 1.2265625, + "learning_rate": 7.989838709677419e-05, + "loss": 0.1946, + "step": 12964 + }, + { + "epoch": 0.20744, + "grad_norm": 0.55078125, + "learning_rate": 7.989677419354839e-05, + "loss": 0.1656, + "step": 12965 + }, + { + "epoch": 0.207456, + "grad_norm": 0.890625, + "learning_rate": 7.989516129032259e-05, + "loss": 0.1769, + "step": 12966 + }, + { + "epoch": 0.207472, + "grad_norm": 2.125, + "learning_rate": 7.989354838709678e-05, + "loss": 0.1867, + "step": 12967 + }, + { + "epoch": 0.207488, + "grad_norm": 0.546875, + "learning_rate": 7.989193548387098e-05, + "loss": 0.1344, + "step": 12968 + }, + { + "epoch": 0.207504, + "grad_norm": 0.796875, + "learning_rate": 7.989032258064516e-05, + "loss": 0.1472, + "step": 12969 + }, + { + "epoch": 0.20752, + "grad_norm": 1.3984375, + "learning_rate": 7.988870967741936e-05, + "loss": 0.1477, + "step": 12970 + }, + { + "epoch": 0.207536, + "grad_norm": 0.87109375, + "learning_rate": 7.988709677419355e-05, + "loss": 0.1714, + "step": 12971 + }, + { + "epoch": 0.207552, + "grad_norm": 0.9140625, + "learning_rate": 7.988548387096775e-05, + "loss": 0.1684, + "step": 12972 + }, + { + "epoch": 0.207568, + "grad_norm": 0.89453125, + "learning_rate": 7.988387096774193e-05, + "loss": 0.1528, + "step": 12973 + }, + { + "epoch": 0.207584, + "grad_norm": 0.7734375, + "learning_rate": 7.988225806451613e-05, + "loss": 0.2059, + "step": 12974 + }, + { + "epoch": 0.2076, + "grad_norm": 1.203125, + "learning_rate": 7.988064516129032e-05, + "loss": 0.1905, + "step": 12975 + }, + { + "epoch": 0.207616, + "grad_norm": 0.83984375, + "learning_rate": 7.987903225806452e-05, + "loss": 0.1827, + "step": 12976 + }, + { + "epoch": 0.207632, + "grad_norm": 0.69921875, + "learning_rate": 7.987741935483872e-05, + "loss": 0.1714, + "step": 12977 + }, + { + "epoch": 0.207648, + "grad_norm": 0.765625, + "learning_rate": 7.987580645161292e-05, + "loss": 0.1841, + "step": 12978 + }, + { + "epoch": 0.207664, + "grad_norm": 0.5546875, + "learning_rate": 7.98741935483871e-05, + "loss": 0.1613, + "step": 12979 + }, + { + "epoch": 0.20768, + "grad_norm": 0.60546875, + "learning_rate": 7.98725806451613e-05, + "loss": 0.1501, + "step": 12980 + }, + { + "epoch": 0.207696, + "grad_norm": 0.84375, + "learning_rate": 7.987096774193549e-05, + "loss": 0.1961, + "step": 12981 + }, + { + "epoch": 0.207712, + "grad_norm": 0.9453125, + "learning_rate": 7.986935483870968e-05, + "loss": 0.1945, + "step": 12982 + }, + { + "epoch": 0.207728, + "grad_norm": 1.21875, + "learning_rate": 7.986774193548387e-05, + "loss": 0.192, + "step": 12983 + }, + { + "epoch": 0.207744, + "grad_norm": 1.046875, + "learning_rate": 7.986612903225806e-05, + "loss": 0.1881, + "step": 12984 + }, + { + "epoch": 0.20776, + "grad_norm": 0.76953125, + "learning_rate": 7.986451612903226e-05, + "loss": 0.1948, + "step": 12985 + }, + { + "epoch": 0.207776, + "grad_norm": 0.80078125, + "learning_rate": 7.986290322580645e-05, + "loss": 0.1664, + "step": 12986 + }, + { + "epoch": 0.207792, + "grad_norm": 0.73828125, + "learning_rate": 7.986129032258065e-05, + "loss": 0.1786, + "step": 12987 + }, + { + "epoch": 0.207808, + "grad_norm": 1.484375, + "learning_rate": 7.985967741935485e-05, + "loss": 0.1619, + "step": 12988 + }, + { + "epoch": 0.207824, + "grad_norm": 0.9296875, + "learning_rate": 7.985806451612903e-05, + "loss": 0.1985, + "step": 12989 + }, + { + "epoch": 0.20784, + "grad_norm": 0.58203125, + "learning_rate": 7.985645161290323e-05, + "loss": 0.188, + "step": 12990 + }, + { + "epoch": 0.207856, + "grad_norm": 0.94140625, + "learning_rate": 7.985483870967743e-05, + "loss": 0.1332, + "step": 12991 + }, + { + "epoch": 0.207872, + "grad_norm": 1.2265625, + "learning_rate": 7.985322580645162e-05, + "loss": 0.1705, + "step": 12992 + }, + { + "epoch": 0.207888, + "grad_norm": 0.73046875, + "learning_rate": 7.985161290322582e-05, + "loss": 0.1933, + "step": 12993 + }, + { + "epoch": 0.207904, + "grad_norm": 1.390625, + "learning_rate": 7.985e-05, + "loss": 0.1947, + "step": 12994 + }, + { + "epoch": 0.20792, + "grad_norm": 0.98046875, + "learning_rate": 7.98483870967742e-05, + "loss": 0.1744, + "step": 12995 + }, + { + "epoch": 0.207936, + "grad_norm": 0.8046875, + "learning_rate": 7.984677419354839e-05, + "loss": 0.1988, + "step": 12996 + }, + { + "epoch": 0.207952, + "grad_norm": 0.7890625, + "learning_rate": 7.984516129032259e-05, + "loss": 0.1879, + "step": 12997 + }, + { + "epoch": 0.207968, + "grad_norm": 1.171875, + "learning_rate": 7.984354838709677e-05, + "loss": 0.1955, + "step": 12998 + }, + { + "epoch": 0.207984, + "grad_norm": 0.734375, + "learning_rate": 7.984193548387096e-05, + "loss": 0.183, + "step": 12999 + }, + { + "epoch": 0.208, + "grad_norm": 0.78515625, + "learning_rate": 7.984032258064516e-05, + "loss": 0.162, + "step": 13000 + }, + { + "epoch": 0.208016, + "grad_norm": 1.109375, + "learning_rate": 7.983870967741936e-05, + "loss": 0.1836, + "step": 13001 + }, + { + "epoch": 0.208032, + "grad_norm": 0.69921875, + "learning_rate": 7.983709677419356e-05, + "loss": 0.1716, + "step": 13002 + }, + { + "epoch": 0.208048, + "grad_norm": 1.0390625, + "learning_rate": 7.983548387096775e-05, + "loss": 0.1633, + "step": 13003 + }, + { + "epoch": 0.208064, + "grad_norm": 1.0546875, + "learning_rate": 7.983387096774194e-05, + "loss": 0.1484, + "step": 13004 + }, + { + "epoch": 0.20808, + "grad_norm": 0.9765625, + "learning_rate": 7.983225806451613e-05, + "loss": 0.148, + "step": 13005 + }, + { + "epoch": 0.208096, + "grad_norm": 0.73828125, + "learning_rate": 7.983064516129033e-05, + "loss": 0.1983, + "step": 13006 + }, + { + "epoch": 0.208112, + "grad_norm": 0.73828125, + "learning_rate": 7.982903225806452e-05, + "loss": 0.1988, + "step": 13007 + }, + { + "epoch": 0.208128, + "grad_norm": 0.890625, + "learning_rate": 7.982741935483872e-05, + "loss": 0.1535, + "step": 13008 + }, + { + "epoch": 0.208144, + "grad_norm": 0.890625, + "learning_rate": 7.98258064516129e-05, + "loss": 0.173, + "step": 13009 + }, + { + "epoch": 0.20816, + "grad_norm": 0.9375, + "learning_rate": 7.98241935483871e-05, + "loss": 0.162, + "step": 13010 + }, + { + "epoch": 0.208176, + "grad_norm": 0.8515625, + "learning_rate": 7.982258064516129e-05, + "loss": 0.1434, + "step": 13011 + }, + { + "epoch": 0.208192, + "grad_norm": 0.91796875, + "learning_rate": 7.982096774193549e-05, + "loss": 0.1779, + "step": 13012 + }, + { + "epoch": 0.208208, + "grad_norm": 1.6015625, + "learning_rate": 7.981935483870969e-05, + "loss": 0.1958, + "step": 13013 + }, + { + "epoch": 0.208224, + "grad_norm": 0.6015625, + "learning_rate": 7.981774193548387e-05, + "loss": 0.1623, + "step": 13014 + }, + { + "epoch": 0.20824, + "grad_norm": 0.70703125, + "learning_rate": 7.981612903225807e-05, + "loss": 0.1919, + "step": 13015 + }, + { + "epoch": 0.208256, + "grad_norm": 1.0078125, + "learning_rate": 7.981451612903226e-05, + "loss": 0.1276, + "step": 13016 + }, + { + "epoch": 0.208272, + "grad_norm": 0.6328125, + "learning_rate": 7.981290322580646e-05, + "loss": 0.1212, + "step": 13017 + }, + { + "epoch": 0.208288, + "grad_norm": 0.5546875, + "learning_rate": 7.981129032258064e-05, + "loss": 0.172, + "step": 13018 + }, + { + "epoch": 0.208304, + "grad_norm": 0.79296875, + "learning_rate": 7.980967741935484e-05, + "loss": 0.1822, + "step": 13019 + }, + { + "epoch": 0.20832, + "grad_norm": 1.046875, + "learning_rate": 7.980806451612903e-05, + "loss": 0.1964, + "step": 13020 + }, + { + "epoch": 0.208336, + "grad_norm": 0.80078125, + "learning_rate": 7.980645161290323e-05, + "loss": 0.1748, + "step": 13021 + }, + { + "epoch": 0.208352, + "grad_norm": 0.6484375, + "learning_rate": 7.980483870967742e-05, + "loss": 0.2067, + "step": 13022 + }, + { + "epoch": 0.208368, + "grad_norm": 0.494140625, + "learning_rate": 7.980322580645162e-05, + "loss": 0.1557, + "step": 13023 + }, + { + "epoch": 0.208384, + "grad_norm": 0.96484375, + "learning_rate": 7.98016129032258e-05, + "loss": 0.1881, + "step": 13024 + }, + { + "epoch": 0.2084, + "grad_norm": 1.5390625, + "learning_rate": 7.98e-05, + "loss": 0.198, + "step": 13025 + }, + { + "epoch": 0.208416, + "grad_norm": 0.73828125, + "learning_rate": 7.97983870967742e-05, + "loss": 0.1561, + "step": 13026 + }, + { + "epoch": 0.208432, + "grad_norm": 1.1796875, + "learning_rate": 7.97967741935484e-05, + "loss": 0.1763, + "step": 13027 + }, + { + "epoch": 0.208448, + "grad_norm": 0.8359375, + "learning_rate": 7.979516129032259e-05, + "loss": 0.1951, + "step": 13028 + }, + { + "epoch": 0.208464, + "grad_norm": 1.1328125, + "learning_rate": 7.979354838709677e-05, + "loss": 0.1878, + "step": 13029 + }, + { + "epoch": 0.20848, + "grad_norm": 0.77734375, + "learning_rate": 7.979193548387097e-05, + "loss": 0.1731, + "step": 13030 + }, + { + "epoch": 0.208496, + "grad_norm": 0.59765625, + "learning_rate": 7.979032258064516e-05, + "loss": 0.1935, + "step": 13031 + }, + { + "epoch": 0.208512, + "grad_norm": 0.671875, + "learning_rate": 7.978870967741936e-05, + "loss": 0.1826, + "step": 13032 + }, + { + "epoch": 0.208528, + "grad_norm": 0.94140625, + "learning_rate": 7.978709677419354e-05, + "loss": 0.1824, + "step": 13033 + }, + { + "epoch": 0.208544, + "grad_norm": 0.640625, + "learning_rate": 7.978548387096774e-05, + "loss": 0.1538, + "step": 13034 + }, + { + "epoch": 0.20856, + "grad_norm": 1.34375, + "learning_rate": 7.978387096774193e-05, + "loss": 0.1546, + "step": 13035 + }, + { + "epoch": 0.208576, + "grad_norm": 0.70703125, + "learning_rate": 7.978225806451613e-05, + "loss": 0.1782, + "step": 13036 + }, + { + "epoch": 0.208592, + "grad_norm": 0.7734375, + "learning_rate": 7.978064516129033e-05, + "loss": 0.2026, + "step": 13037 + }, + { + "epoch": 0.208608, + "grad_norm": 0.6171875, + "learning_rate": 7.977903225806453e-05, + "loss": 0.1395, + "step": 13038 + }, + { + "epoch": 0.208624, + "grad_norm": 0.65234375, + "learning_rate": 7.977741935483872e-05, + "loss": 0.1722, + "step": 13039 + }, + { + "epoch": 0.20864, + "grad_norm": 0.8828125, + "learning_rate": 7.977580645161291e-05, + "loss": 0.1531, + "step": 13040 + }, + { + "epoch": 0.208656, + "grad_norm": 1.1171875, + "learning_rate": 7.97741935483871e-05, + "loss": 0.2305, + "step": 13041 + }, + { + "epoch": 0.208672, + "grad_norm": 0.91015625, + "learning_rate": 7.97725806451613e-05, + "loss": 0.2254, + "step": 13042 + }, + { + "epoch": 0.208688, + "grad_norm": 0.7578125, + "learning_rate": 7.977096774193549e-05, + "loss": 0.1838, + "step": 13043 + }, + { + "epoch": 0.208704, + "grad_norm": 0.7265625, + "learning_rate": 7.976935483870969e-05, + "loss": 0.1814, + "step": 13044 + }, + { + "epoch": 0.20872, + "grad_norm": 0.796875, + "learning_rate": 7.976774193548387e-05, + "loss": 0.1825, + "step": 13045 + }, + { + "epoch": 0.208736, + "grad_norm": 0.6796875, + "learning_rate": 7.976612903225806e-05, + "loss": 0.1529, + "step": 13046 + }, + { + "epoch": 0.208752, + "grad_norm": 0.8125, + "learning_rate": 7.976451612903226e-05, + "loss": 0.1546, + "step": 13047 + }, + { + "epoch": 0.208768, + "grad_norm": 0.5234375, + "learning_rate": 7.976290322580646e-05, + "loss": 0.1715, + "step": 13048 + }, + { + "epoch": 0.208784, + "grad_norm": 0.55859375, + "learning_rate": 7.976129032258066e-05, + "loss": 0.1657, + "step": 13049 + }, + { + "epoch": 0.2088, + "grad_norm": 0.94140625, + "learning_rate": 7.975967741935484e-05, + "loss": 0.2039, + "step": 13050 + }, + { + "epoch": 0.208816, + "grad_norm": 0.890625, + "learning_rate": 7.975806451612904e-05, + "loss": 0.1688, + "step": 13051 + }, + { + "epoch": 0.208832, + "grad_norm": 0.61328125, + "learning_rate": 7.975645161290323e-05, + "loss": 0.1405, + "step": 13052 + }, + { + "epoch": 0.208848, + "grad_norm": 0.6015625, + "learning_rate": 7.975483870967743e-05, + "loss": 0.1719, + "step": 13053 + }, + { + "epoch": 0.208864, + "grad_norm": 0.55078125, + "learning_rate": 7.975322580645161e-05, + "loss": 0.1433, + "step": 13054 + }, + { + "epoch": 0.20888, + "grad_norm": 0.51953125, + "learning_rate": 7.975161290322581e-05, + "loss": 0.1689, + "step": 13055 + }, + { + "epoch": 0.208896, + "grad_norm": 1.078125, + "learning_rate": 7.975e-05, + "loss": 0.1911, + "step": 13056 + }, + { + "epoch": 0.208912, + "grad_norm": 0.65625, + "learning_rate": 7.97483870967742e-05, + "loss": 0.1554, + "step": 13057 + }, + { + "epoch": 0.208928, + "grad_norm": 0.73046875, + "learning_rate": 7.974677419354839e-05, + "loss": 0.1954, + "step": 13058 + }, + { + "epoch": 0.208944, + "grad_norm": 0.82421875, + "learning_rate": 7.974516129032259e-05, + "loss": 0.1977, + "step": 13059 + }, + { + "epoch": 0.20896, + "grad_norm": 0.671875, + "learning_rate": 7.974354838709677e-05, + "loss": 0.1343, + "step": 13060 + }, + { + "epoch": 0.208976, + "grad_norm": 1.0234375, + "learning_rate": 7.974193548387097e-05, + "loss": 0.1578, + "step": 13061 + }, + { + "epoch": 0.208992, + "grad_norm": 0.49609375, + "learning_rate": 7.974032258064517e-05, + "loss": 0.1525, + "step": 13062 + }, + { + "epoch": 0.209008, + "grad_norm": 0.58203125, + "learning_rate": 7.973870967741936e-05, + "loss": 0.1603, + "step": 13063 + }, + { + "epoch": 0.209024, + "grad_norm": 0.90234375, + "learning_rate": 7.973709677419356e-05, + "loss": 0.1884, + "step": 13064 + }, + { + "epoch": 0.20904, + "grad_norm": 0.80078125, + "learning_rate": 7.973548387096774e-05, + "loss": 0.1629, + "step": 13065 + }, + { + "epoch": 0.209056, + "grad_norm": 0.6875, + "learning_rate": 7.973387096774194e-05, + "loss": 0.1964, + "step": 13066 + }, + { + "epoch": 0.209072, + "grad_norm": 1.234375, + "learning_rate": 7.973225806451613e-05, + "loss": 0.099, + "step": 13067 + }, + { + "epoch": 0.209088, + "grad_norm": 0.84375, + "learning_rate": 7.973064516129033e-05, + "loss": 0.1381, + "step": 13068 + }, + { + "epoch": 0.209104, + "grad_norm": 0.6640625, + "learning_rate": 7.972903225806451e-05, + "loss": 0.1772, + "step": 13069 + }, + { + "epoch": 0.20912, + "grad_norm": 0.8828125, + "learning_rate": 7.972741935483871e-05, + "loss": 0.1869, + "step": 13070 + }, + { + "epoch": 0.209136, + "grad_norm": 0.79296875, + "learning_rate": 7.97258064516129e-05, + "loss": 0.1726, + "step": 13071 + }, + { + "epoch": 0.209152, + "grad_norm": 0.6796875, + "learning_rate": 7.97241935483871e-05, + "loss": 0.1511, + "step": 13072 + }, + { + "epoch": 0.209168, + "grad_norm": 0.765625, + "learning_rate": 7.97225806451613e-05, + "loss": 0.216, + "step": 13073 + }, + { + "epoch": 0.209184, + "grad_norm": 0.93359375, + "learning_rate": 7.97209677419355e-05, + "loss": 0.2015, + "step": 13074 + }, + { + "epoch": 0.2092, + "grad_norm": 0.88671875, + "learning_rate": 7.971935483870969e-05, + "loss": 0.1858, + "step": 13075 + }, + { + "epoch": 0.209216, + "grad_norm": 1.9765625, + "learning_rate": 7.971774193548387e-05, + "loss": 0.202, + "step": 13076 + }, + { + "epoch": 0.209232, + "grad_norm": 0.5625, + "learning_rate": 7.971612903225807e-05, + "loss": 0.1623, + "step": 13077 + }, + { + "epoch": 0.209248, + "grad_norm": 1.0390625, + "learning_rate": 7.971451612903226e-05, + "loss": 0.1699, + "step": 13078 + }, + { + "epoch": 0.209264, + "grad_norm": 0.73046875, + "learning_rate": 7.971290322580646e-05, + "loss": 0.1914, + "step": 13079 + }, + { + "epoch": 0.20928, + "grad_norm": 0.81640625, + "learning_rate": 7.971129032258064e-05, + "loss": 0.1789, + "step": 13080 + }, + { + "epoch": 0.209296, + "grad_norm": 0.6640625, + "learning_rate": 7.970967741935484e-05, + "loss": 0.1155, + "step": 13081 + }, + { + "epoch": 0.209312, + "grad_norm": 0.734375, + "learning_rate": 7.970806451612903e-05, + "loss": 0.1665, + "step": 13082 + }, + { + "epoch": 0.209328, + "grad_norm": 0.69921875, + "learning_rate": 7.970645161290323e-05, + "loss": 0.1455, + "step": 13083 + }, + { + "epoch": 0.209344, + "grad_norm": 0.87890625, + "learning_rate": 7.970483870967743e-05, + "loss": 0.1746, + "step": 13084 + }, + { + "epoch": 0.20936, + "grad_norm": 0.9453125, + "learning_rate": 7.970322580645161e-05, + "loss": 0.1901, + "step": 13085 + }, + { + "epoch": 0.209376, + "grad_norm": 1.125, + "learning_rate": 7.970161290322581e-05, + "loss": 0.1664, + "step": 13086 + }, + { + "epoch": 0.209392, + "grad_norm": 0.72265625, + "learning_rate": 7.970000000000001e-05, + "loss": 0.1581, + "step": 13087 + }, + { + "epoch": 0.209408, + "grad_norm": 0.87109375, + "learning_rate": 7.96983870967742e-05, + "loss": 0.1736, + "step": 13088 + }, + { + "epoch": 0.209424, + "grad_norm": 0.796875, + "learning_rate": 7.96967741935484e-05, + "loss": 0.1931, + "step": 13089 + }, + { + "epoch": 0.20944, + "grad_norm": 0.92578125, + "learning_rate": 7.969516129032258e-05, + "loss": 0.1573, + "step": 13090 + }, + { + "epoch": 0.209456, + "grad_norm": 0.69140625, + "learning_rate": 7.969354838709677e-05, + "loss": 0.1406, + "step": 13091 + }, + { + "epoch": 0.209472, + "grad_norm": 1.015625, + "learning_rate": 7.969193548387097e-05, + "loss": 0.1547, + "step": 13092 + }, + { + "epoch": 0.209488, + "grad_norm": 1.140625, + "learning_rate": 7.969032258064516e-05, + "loss": 0.1758, + "step": 13093 + }, + { + "epoch": 0.209504, + "grad_norm": 2.21875, + "learning_rate": 7.968870967741936e-05, + "loss": 0.1958, + "step": 13094 + }, + { + "epoch": 0.20952, + "grad_norm": 0.953125, + "learning_rate": 7.968709677419354e-05, + "loss": 0.1832, + "step": 13095 + }, + { + "epoch": 0.209536, + "grad_norm": 0.7890625, + "learning_rate": 7.968548387096774e-05, + "loss": 0.1684, + "step": 13096 + }, + { + "epoch": 0.209552, + "grad_norm": 0.59765625, + "learning_rate": 7.968387096774194e-05, + "loss": 0.2013, + "step": 13097 + }, + { + "epoch": 0.209568, + "grad_norm": 0.86328125, + "learning_rate": 7.968225806451614e-05, + "loss": 0.1528, + "step": 13098 + }, + { + "epoch": 0.209584, + "grad_norm": 1.7265625, + "learning_rate": 7.968064516129033e-05, + "loss": 0.1842, + "step": 13099 + }, + { + "epoch": 0.2096, + "grad_norm": 1.2109375, + "learning_rate": 7.967903225806453e-05, + "loss": 0.189, + "step": 13100 + }, + { + "epoch": 0.209616, + "grad_norm": 0.73046875, + "learning_rate": 7.967741935483871e-05, + "loss": 0.1539, + "step": 13101 + }, + { + "epoch": 0.209632, + "grad_norm": 0.890625, + "learning_rate": 7.967580645161291e-05, + "loss": 0.1489, + "step": 13102 + }, + { + "epoch": 0.209648, + "grad_norm": 1.3515625, + "learning_rate": 7.96741935483871e-05, + "loss": 0.1464, + "step": 13103 + }, + { + "epoch": 0.209664, + "grad_norm": 0.69140625, + "learning_rate": 7.96725806451613e-05, + "loss": 0.1344, + "step": 13104 + }, + { + "epoch": 0.20968, + "grad_norm": 0.8046875, + "learning_rate": 7.967096774193548e-05, + "loss": 0.1804, + "step": 13105 + }, + { + "epoch": 0.209696, + "grad_norm": 0.6640625, + "learning_rate": 7.966935483870968e-05, + "loss": 0.2109, + "step": 13106 + }, + { + "epoch": 0.209712, + "grad_norm": 0.98828125, + "learning_rate": 7.966774193548387e-05, + "loss": 0.1994, + "step": 13107 + }, + { + "epoch": 0.209728, + "grad_norm": 1.1015625, + "learning_rate": 7.966612903225807e-05, + "loss": 0.1702, + "step": 13108 + }, + { + "epoch": 0.209744, + "grad_norm": 0.9296875, + "learning_rate": 7.966451612903227e-05, + "loss": 0.1298, + "step": 13109 + }, + { + "epoch": 0.20976, + "grad_norm": 0.75, + "learning_rate": 7.966290322580646e-05, + "loss": 0.1749, + "step": 13110 + }, + { + "epoch": 0.209776, + "grad_norm": 0.6953125, + "learning_rate": 7.966129032258065e-05, + "loss": 0.1321, + "step": 13111 + }, + { + "epoch": 0.209792, + "grad_norm": 1.234375, + "learning_rate": 7.965967741935484e-05, + "loss": 0.1478, + "step": 13112 + }, + { + "epoch": 0.209808, + "grad_norm": 0.9140625, + "learning_rate": 7.965806451612904e-05, + "loss": 0.1789, + "step": 13113 + }, + { + "epoch": 0.209824, + "grad_norm": 0.74609375, + "learning_rate": 7.965645161290323e-05, + "loss": 0.1497, + "step": 13114 + }, + { + "epoch": 0.20984, + "grad_norm": 0.6171875, + "learning_rate": 7.965483870967743e-05, + "loss": 0.187, + "step": 13115 + }, + { + "epoch": 0.209856, + "grad_norm": 0.625, + "learning_rate": 7.965322580645161e-05, + "loss": 0.1732, + "step": 13116 + }, + { + "epoch": 0.209872, + "grad_norm": 1.3125, + "learning_rate": 7.965161290322581e-05, + "loss": 0.1956, + "step": 13117 + }, + { + "epoch": 0.209888, + "grad_norm": 0.75390625, + "learning_rate": 7.965e-05, + "loss": 0.1881, + "step": 13118 + }, + { + "epoch": 0.209904, + "grad_norm": 1.515625, + "learning_rate": 7.96483870967742e-05, + "loss": 0.1679, + "step": 13119 + }, + { + "epoch": 0.20992, + "grad_norm": 1.71875, + "learning_rate": 7.964677419354838e-05, + "loss": 0.1764, + "step": 13120 + }, + { + "epoch": 0.209936, + "grad_norm": 1.0078125, + "learning_rate": 7.964516129032258e-05, + "loss": 0.1909, + "step": 13121 + }, + { + "epoch": 0.209952, + "grad_norm": 0.7734375, + "learning_rate": 7.964354838709678e-05, + "loss": 0.1826, + "step": 13122 + }, + { + "epoch": 0.209968, + "grad_norm": 0.7734375, + "learning_rate": 7.964193548387097e-05, + "loss": 0.1773, + "step": 13123 + }, + { + "epoch": 0.209984, + "grad_norm": 0.9921875, + "learning_rate": 7.964032258064517e-05, + "loss": 0.1723, + "step": 13124 + }, + { + "epoch": 0.21, + "grad_norm": 0.97265625, + "learning_rate": 7.963870967741935e-05, + "loss": 0.2104, + "step": 13125 + }, + { + "epoch": 0.210016, + "grad_norm": 0.66015625, + "learning_rate": 7.963709677419355e-05, + "loss": 0.1399, + "step": 13126 + }, + { + "epoch": 0.210032, + "grad_norm": 0.71875, + "learning_rate": 7.963548387096774e-05, + "loss": 0.1155, + "step": 13127 + }, + { + "epoch": 0.210048, + "grad_norm": 0.8125, + "learning_rate": 7.963387096774194e-05, + "loss": 0.1921, + "step": 13128 + }, + { + "epoch": 0.210064, + "grad_norm": 0.6015625, + "learning_rate": 7.963225806451613e-05, + "loss": 0.1817, + "step": 13129 + }, + { + "epoch": 0.21008, + "grad_norm": 0.75, + "learning_rate": 7.963064516129033e-05, + "loss": 0.1821, + "step": 13130 + }, + { + "epoch": 0.210096, + "grad_norm": 0.6953125, + "learning_rate": 7.962903225806451e-05, + "loss": 0.1283, + "step": 13131 + }, + { + "epoch": 0.210112, + "grad_norm": 0.66015625, + "learning_rate": 7.962741935483871e-05, + "loss": 0.1528, + "step": 13132 + }, + { + "epoch": 0.210128, + "grad_norm": 0.6015625, + "learning_rate": 7.962580645161291e-05, + "loss": 0.1525, + "step": 13133 + }, + { + "epoch": 0.210144, + "grad_norm": 0.86328125, + "learning_rate": 7.962419354838711e-05, + "loss": 0.1798, + "step": 13134 + }, + { + "epoch": 0.21016, + "grad_norm": 0.5546875, + "learning_rate": 7.96225806451613e-05, + "loss": 0.133, + "step": 13135 + }, + { + "epoch": 0.210176, + "grad_norm": 0.76953125, + "learning_rate": 7.96209677419355e-05, + "loss": 0.1964, + "step": 13136 + }, + { + "epoch": 0.210192, + "grad_norm": 0.890625, + "learning_rate": 7.961935483870968e-05, + "loss": 0.1969, + "step": 13137 + }, + { + "epoch": 0.210208, + "grad_norm": 0.59765625, + "learning_rate": 7.961774193548387e-05, + "loss": 0.1823, + "step": 13138 + }, + { + "epoch": 0.210224, + "grad_norm": 0.65625, + "learning_rate": 7.961612903225807e-05, + "loss": 0.1361, + "step": 13139 + }, + { + "epoch": 0.21024, + "grad_norm": 0.58203125, + "learning_rate": 7.961451612903225e-05, + "loss": 0.1792, + "step": 13140 + }, + { + "epoch": 0.210256, + "grad_norm": 0.70703125, + "learning_rate": 7.961290322580645e-05, + "loss": 0.1897, + "step": 13141 + }, + { + "epoch": 0.210272, + "grad_norm": 0.83984375, + "learning_rate": 7.961129032258064e-05, + "loss": 0.1519, + "step": 13142 + }, + { + "epoch": 0.210288, + "grad_norm": 0.7421875, + "learning_rate": 7.960967741935484e-05, + "loss": 0.1945, + "step": 13143 + }, + { + "epoch": 0.210304, + "grad_norm": 0.8046875, + "learning_rate": 7.960806451612904e-05, + "loss": 0.2006, + "step": 13144 + }, + { + "epoch": 0.21032, + "grad_norm": 0.8984375, + "learning_rate": 7.960645161290324e-05, + "loss": 0.1583, + "step": 13145 + }, + { + "epoch": 0.210336, + "grad_norm": 0.65234375, + "learning_rate": 7.960483870967743e-05, + "loss": 0.1745, + "step": 13146 + }, + { + "epoch": 0.210352, + "grad_norm": 0.52734375, + "learning_rate": 7.960322580645162e-05, + "loss": 0.1762, + "step": 13147 + }, + { + "epoch": 0.210368, + "grad_norm": 0.62109375, + "learning_rate": 7.960161290322581e-05, + "loss": 0.1566, + "step": 13148 + }, + { + "epoch": 0.210384, + "grad_norm": 0.76171875, + "learning_rate": 7.960000000000001e-05, + "loss": 0.1699, + "step": 13149 + }, + { + "epoch": 0.2104, + "grad_norm": 0.6171875, + "learning_rate": 7.95983870967742e-05, + "loss": 0.1567, + "step": 13150 + }, + { + "epoch": 0.210416, + "grad_norm": 0.8984375, + "learning_rate": 7.95967741935484e-05, + "loss": 0.1587, + "step": 13151 + }, + { + "epoch": 0.210432, + "grad_norm": 0.859375, + "learning_rate": 7.959516129032258e-05, + "loss": 0.1808, + "step": 13152 + }, + { + "epoch": 0.210448, + "grad_norm": 0.703125, + "learning_rate": 7.959354838709678e-05, + "loss": 0.1845, + "step": 13153 + }, + { + "epoch": 0.210464, + "grad_norm": 1.0, + "learning_rate": 7.959193548387097e-05, + "loss": 0.2037, + "step": 13154 + }, + { + "epoch": 0.21048, + "grad_norm": 0.81640625, + "learning_rate": 7.959032258064515e-05, + "loss": 0.1483, + "step": 13155 + }, + { + "epoch": 0.210496, + "grad_norm": 0.76953125, + "learning_rate": 7.958870967741935e-05, + "loss": 0.1727, + "step": 13156 + }, + { + "epoch": 0.210512, + "grad_norm": 1.296875, + "learning_rate": 7.958709677419355e-05, + "loss": 0.1741, + "step": 13157 + }, + { + "epoch": 0.210528, + "grad_norm": 1.046875, + "learning_rate": 7.958548387096775e-05, + "loss": 0.1963, + "step": 13158 + }, + { + "epoch": 0.210544, + "grad_norm": 0.6484375, + "learning_rate": 7.958387096774194e-05, + "loss": 0.1597, + "step": 13159 + }, + { + "epoch": 0.21056, + "grad_norm": 0.8359375, + "learning_rate": 7.958225806451614e-05, + "loss": 0.1732, + "step": 13160 + }, + { + "epoch": 0.210576, + "grad_norm": 0.83203125, + "learning_rate": 7.958064516129032e-05, + "loss": 0.1627, + "step": 13161 + }, + { + "epoch": 0.210592, + "grad_norm": 0.890625, + "learning_rate": 7.957903225806452e-05, + "loss": 0.1272, + "step": 13162 + }, + { + "epoch": 0.210608, + "grad_norm": 0.79296875, + "learning_rate": 7.957741935483871e-05, + "loss": 0.2051, + "step": 13163 + }, + { + "epoch": 0.210624, + "grad_norm": 0.75390625, + "learning_rate": 7.957580645161291e-05, + "loss": 0.1752, + "step": 13164 + }, + { + "epoch": 0.21064, + "grad_norm": 0.81640625, + "learning_rate": 7.95741935483871e-05, + "loss": 0.1837, + "step": 13165 + }, + { + "epoch": 0.210656, + "grad_norm": 0.7421875, + "learning_rate": 7.95725806451613e-05, + "loss": 0.1798, + "step": 13166 + }, + { + "epoch": 0.210672, + "grad_norm": 0.91796875, + "learning_rate": 7.957096774193548e-05, + "loss": 0.2077, + "step": 13167 + }, + { + "epoch": 0.210688, + "grad_norm": 1.5546875, + "learning_rate": 7.956935483870968e-05, + "loss": 0.1979, + "step": 13168 + }, + { + "epoch": 0.210704, + "grad_norm": 0.63671875, + "learning_rate": 7.956774193548388e-05, + "loss": 0.1769, + "step": 13169 + }, + { + "epoch": 0.21072, + "grad_norm": 1.1328125, + "learning_rate": 7.956612903225807e-05, + "loss": 0.2029, + "step": 13170 + }, + { + "epoch": 0.210736, + "grad_norm": 0.87109375, + "learning_rate": 7.956451612903227e-05, + "loss": 0.2014, + "step": 13171 + }, + { + "epoch": 0.210752, + "grad_norm": 1.09375, + "learning_rate": 7.956290322580645e-05, + "loss": 0.1964, + "step": 13172 + }, + { + "epoch": 0.210768, + "grad_norm": 0.80078125, + "learning_rate": 7.956129032258065e-05, + "loss": 0.1637, + "step": 13173 + }, + { + "epoch": 0.210784, + "grad_norm": 0.72265625, + "learning_rate": 7.955967741935484e-05, + "loss": 0.1921, + "step": 13174 + }, + { + "epoch": 0.2108, + "grad_norm": 0.76171875, + "learning_rate": 7.955806451612904e-05, + "loss": 0.1632, + "step": 13175 + }, + { + "epoch": 0.210816, + "grad_norm": 0.67578125, + "learning_rate": 7.955645161290322e-05, + "loss": 0.1662, + "step": 13176 + }, + { + "epoch": 0.210832, + "grad_norm": 0.8125, + "learning_rate": 7.955483870967742e-05, + "loss": 0.1703, + "step": 13177 + }, + { + "epoch": 0.210848, + "grad_norm": 1.390625, + "learning_rate": 7.955322580645161e-05, + "loss": 0.2124, + "step": 13178 + }, + { + "epoch": 0.210864, + "grad_norm": 0.796875, + "learning_rate": 7.955161290322581e-05, + "loss": 0.1974, + "step": 13179 + }, + { + "epoch": 0.21088, + "grad_norm": 0.640625, + "learning_rate": 7.955e-05, + "loss": 0.1612, + "step": 13180 + }, + { + "epoch": 0.210896, + "grad_norm": 0.89453125, + "learning_rate": 7.95483870967742e-05, + "loss": 0.1629, + "step": 13181 + }, + { + "epoch": 0.210912, + "grad_norm": 0.83984375, + "learning_rate": 7.95467741935484e-05, + "loss": 0.1834, + "step": 13182 + }, + { + "epoch": 0.210928, + "grad_norm": 0.71484375, + "learning_rate": 7.95451612903226e-05, + "loss": 0.1873, + "step": 13183 + }, + { + "epoch": 0.210944, + "grad_norm": 0.484375, + "learning_rate": 7.954354838709678e-05, + "loss": 0.145, + "step": 13184 + }, + { + "epoch": 0.21096, + "grad_norm": 0.8515625, + "learning_rate": 7.954193548387097e-05, + "loss": 0.173, + "step": 13185 + }, + { + "epoch": 0.210976, + "grad_norm": 0.72265625, + "learning_rate": 7.954032258064517e-05, + "loss": 0.1774, + "step": 13186 + }, + { + "epoch": 0.210992, + "grad_norm": 0.5859375, + "learning_rate": 7.953870967741935e-05, + "loss": 0.1819, + "step": 13187 + }, + { + "epoch": 0.211008, + "grad_norm": 0.703125, + "learning_rate": 7.953709677419355e-05, + "loss": 0.1858, + "step": 13188 + }, + { + "epoch": 0.211024, + "grad_norm": 0.8671875, + "learning_rate": 7.953548387096774e-05, + "loss": 0.1356, + "step": 13189 + }, + { + "epoch": 0.21104, + "grad_norm": 1.0703125, + "learning_rate": 7.953387096774194e-05, + "loss": 0.1546, + "step": 13190 + }, + { + "epoch": 0.211056, + "grad_norm": 0.6328125, + "learning_rate": 7.953225806451612e-05, + "loss": 0.1984, + "step": 13191 + }, + { + "epoch": 0.211072, + "grad_norm": 1.828125, + "learning_rate": 7.953064516129032e-05, + "loss": 0.1994, + "step": 13192 + }, + { + "epoch": 0.211088, + "grad_norm": 1.15625, + "learning_rate": 7.952903225806452e-05, + "loss": 0.1806, + "step": 13193 + }, + { + "epoch": 0.211104, + "grad_norm": 0.8984375, + "learning_rate": 7.952741935483872e-05, + "loss": 0.1759, + "step": 13194 + }, + { + "epoch": 0.21112, + "grad_norm": 0.8828125, + "learning_rate": 7.952580645161291e-05, + "loss": 0.1635, + "step": 13195 + }, + { + "epoch": 0.211136, + "grad_norm": 0.64453125, + "learning_rate": 7.952419354838711e-05, + "loss": 0.1914, + "step": 13196 + }, + { + "epoch": 0.211152, + "grad_norm": 1.8671875, + "learning_rate": 7.95225806451613e-05, + "loss": 0.1628, + "step": 13197 + }, + { + "epoch": 0.211168, + "grad_norm": 0.6640625, + "learning_rate": 7.95209677419355e-05, + "loss": 0.1872, + "step": 13198 + }, + { + "epoch": 0.211184, + "grad_norm": 0.8203125, + "learning_rate": 7.951935483870968e-05, + "loss": 0.2294, + "step": 13199 + }, + { + "epoch": 0.2112, + "grad_norm": 0.55078125, + "learning_rate": 7.951774193548387e-05, + "loss": 0.1791, + "step": 13200 + }, + { + "epoch": 0.211216, + "grad_norm": 0.7734375, + "learning_rate": 7.951612903225807e-05, + "loss": 0.172, + "step": 13201 + }, + { + "epoch": 0.211232, + "grad_norm": 1.0390625, + "learning_rate": 7.951451612903225e-05, + "loss": 0.1523, + "step": 13202 + }, + { + "epoch": 0.211248, + "grad_norm": 0.8359375, + "learning_rate": 7.951290322580645e-05, + "loss": 0.1803, + "step": 13203 + }, + { + "epoch": 0.211264, + "grad_norm": 0.72265625, + "learning_rate": 7.951129032258065e-05, + "loss": 0.1772, + "step": 13204 + }, + { + "epoch": 0.21128, + "grad_norm": 0.73046875, + "learning_rate": 7.950967741935485e-05, + "loss": 0.1771, + "step": 13205 + }, + { + "epoch": 0.211296, + "grad_norm": 0.90234375, + "learning_rate": 7.950806451612904e-05, + "loss": 0.1841, + "step": 13206 + }, + { + "epoch": 0.211312, + "grad_norm": 0.56640625, + "learning_rate": 7.950645161290324e-05, + "loss": 0.1673, + "step": 13207 + }, + { + "epoch": 0.211328, + "grad_norm": 0.9375, + "learning_rate": 7.950483870967742e-05, + "loss": 0.1844, + "step": 13208 + }, + { + "epoch": 0.211344, + "grad_norm": 0.6796875, + "learning_rate": 7.950322580645162e-05, + "loss": 0.156, + "step": 13209 + }, + { + "epoch": 0.21136, + "grad_norm": 0.5625, + "learning_rate": 7.950161290322581e-05, + "loss": 0.1433, + "step": 13210 + }, + { + "epoch": 0.211376, + "grad_norm": 0.66796875, + "learning_rate": 7.950000000000001e-05, + "loss": 0.1823, + "step": 13211 + }, + { + "epoch": 0.211392, + "grad_norm": 0.6328125, + "learning_rate": 7.94983870967742e-05, + "loss": 0.1895, + "step": 13212 + }, + { + "epoch": 0.211408, + "grad_norm": 0.6328125, + "learning_rate": 7.94967741935484e-05, + "loss": 0.1818, + "step": 13213 + }, + { + "epoch": 0.211424, + "grad_norm": 0.6875, + "learning_rate": 7.949516129032258e-05, + "loss": 0.1799, + "step": 13214 + }, + { + "epoch": 0.21144, + "grad_norm": 0.69921875, + "learning_rate": 7.949354838709678e-05, + "loss": 0.1646, + "step": 13215 + }, + { + "epoch": 0.211456, + "grad_norm": 0.8046875, + "learning_rate": 7.949193548387097e-05, + "loss": 0.1571, + "step": 13216 + }, + { + "epoch": 0.211472, + "grad_norm": 0.9296875, + "learning_rate": 7.949032258064517e-05, + "loss": 0.1809, + "step": 13217 + }, + { + "epoch": 0.211488, + "grad_norm": 0.60546875, + "learning_rate": 7.948870967741936e-05, + "loss": 0.2, + "step": 13218 + }, + { + "epoch": 0.211504, + "grad_norm": 0.6015625, + "learning_rate": 7.948709677419355e-05, + "loss": 0.1431, + "step": 13219 + }, + { + "epoch": 0.21152, + "grad_norm": 1.125, + "learning_rate": 7.948548387096775e-05, + "loss": 0.2132, + "step": 13220 + }, + { + "epoch": 0.211536, + "grad_norm": 0.92578125, + "learning_rate": 7.948387096774194e-05, + "loss": 0.1775, + "step": 13221 + }, + { + "epoch": 0.211552, + "grad_norm": 0.73828125, + "learning_rate": 7.948225806451614e-05, + "loss": 0.1993, + "step": 13222 + }, + { + "epoch": 0.211568, + "grad_norm": 0.90234375, + "learning_rate": 7.948064516129032e-05, + "loss": 0.1794, + "step": 13223 + }, + { + "epoch": 0.211584, + "grad_norm": 0.78515625, + "learning_rate": 7.947903225806452e-05, + "loss": 0.1872, + "step": 13224 + }, + { + "epoch": 0.2116, + "grad_norm": 0.78125, + "learning_rate": 7.947741935483871e-05, + "loss": 0.1294, + "step": 13225 + }, + { + "epoch": 0.211616, + "grad_norm": 0.90625, + "learning_rate": 7.947580645161291e-05, + "loss": 0.1996, + "step": 13226 + }, + { + "epoch": 0.211632, + "grad_norm": 0.74609375, + "learning_rate": 7.94741935483871e-05, + "loss": 0.1745, + "step": 13227 + }, + { + "epoch": 0.211648, + "grad_norm": 0.81640625, + "learning_rate": 7.94725806451613e-05, + "loss": 0.1897, + "step": 13228 + }, + { + "epoch": 0.211664, + "grad_norm": 0.86328125, + "learning_rate": 7.947096774193549e-05, + "loss": 0.2069, + "step": 13229 + }, + { + "epoch": 0.21168, + "grad_norm": 0.69140625, + "learning_rate": 7.946935483870969e-05, + "loss": 0.1871, + "step": 13230 + }, + { + "epoch": 0.211696, + "grad_norm": 0.61328125, + "learning_rate": 7.946774193548388e-05, + "loss": 0.1706, + "step": 13231 + }, + { + "epoch": 0.211712, + "grad_norm": 0.8046875, + "learning_rate": 7.946612903225806e-05, + "loss": 0.1887, + "step": 13232 + }, + { + "epoch": 0.211728, + "grad_norm": 0.76171875, + "learning_rate": 7.946451612903226e-05, + "loss": 0.193, + "step": 13233 + }, + { + "epoch": 0.211744, + "grad_norm": 0.921875, + "learning_rate": 7.946290322580645e-05, + "loss": 0.1487, + "step": 13234 + }, + { + "epoch": 0.21176, + "grad_norm": 0.8203125, + "learning_rate": 7.946129032258065e-05, + "loss": 0.1505, + "step": 13235 + }, + { + "epoch": 0.211776, + "grad_norm": 0.9765625, + "learning_rate": 7.945967741935484e-05, + "loss": 0.15, + "step": 13236 + }, + { + "epoch": 0.211792, + "grad_norm": 1.0234375, + "learning_rate": 7.945806451612904e-05, + "loss": 0.1557, + "step": 13237 + }, + { + "epoch": 0.211808, + "grad_norm": 0.73828125, + "learning_rate": 7.945645161290322e-05, + "loss": 0.1997, + "step": 13238 + }, + { + "epoch": 0.211824, + "grad_norm": 0.87890625, + "learning_rate": 7.945483870967742e-05, + "loss": 0.2028, + "step": 13239 + }, + { + "epoch": 0.21184, + "grad_norm": 1.0078125, + "learning_rate": 7.945322580645162e-05, + "loss": 0.2035, + "step": 13240 + }, + { + "epoch": 0.211856, + "grad_norm": 0.87890625, + "learning_rate": 7.945161290322581e-05, + "loss": 0.2054, + "step": 13241 + }, + { + "epoch": 0.211872, + "grad_norm": 1.1015625, + "learning_rate": 7.945e-05, + "loss": 0.2268, + "step": 13242 + }, + { + "epoch": 0.211888, + "grad_norm": 0.57421875, + "learning_rate": 7.94483870967742e-05, + "loss": 0.1893, + "step": 13243 + }, + { + "epoch": 0.211904, + "grad_norm": 0.91015625, + "learning_rate": 7.944677419354839e-05, + "loss": 0.1865, + "step": 13244 + }, + { + "epoch": 0.21192, + "grad_norm": 0.6875, + "learning_rate": 7.944516129032259e-05, + "loss": 0.1638, + "step": 13245 + }, + { + "epoch": 0.211936, + "grad_norm": 1.234375, + "learning_rate": 7.944354838709678e-05, + "loss": 0.1513, + "step": 13246 + }, + { + "epoch": 0.211952, + "grad_norm": 0.734375, + "learning_rate": 7.944193548387096e-05, + "loss": 0.1896, + "step": 13247 + }, + { + "epoch": 0.211968, + "grad_norm": 0.734375, + "learning_rate": 7.944032258064516e-05, + "loss": 0.186, + "step": 13248 + }, + { + "epoch": 0.211984, + "grad_norm": 0.58203125, + "learning_rate": 7.943870967741935e-05, + "loss": 0.1495, + "step": 13249 + }, + { + "epoch": 0.212, + "grad_norm": 0.56640625, + "learning_rate": 7.943709677419355e-05, + "loss": 0.1642, + "step": 13250 + }, + { + "epoch": 0.212016, + "grad_norm": 0.58984375, + "learning_rate": 7.943548387096774e-05, + "loss": 0.167, + "step": 13251 + }, + { + "epoch": 0.212032, + "grad_norm": 1.2109375, + "learning_rate": 7.943387096774194e-05, + "loss": 0.1513, + "step": 13252 + }, + { + "epoch": 0.212048, + "grad_norm": 0.5, + "learning_rate": 7.943225806451613e-05, + "loss": 0.143, + "step": 13253 + }, + { + "epoch": 0.212064, + "grad_norm": 0.92578125, + "learning_rate": 7.943064516129033e-05, + "loss": 0.1622, + "step": 13254 + }, + { + "epoch": 0.21208, + "grad_norm": 0.73828125, + "learning_rate": 7.942903225806452e-05, + "loss": 0.1689, + "step": 13255 + }, + { + "epoch": 0.212096, + "grad_norm": 1.1171875, + "learning_rate": 7.942741935483872e-05, + "loss": 0.1865, + "step": 13256 + }, + { + "epoch": 0.212112, + "grad_norm": 1.046875, + "learning_rate": 7.94258064516129e-05, + "loss": 0.1813, + "step": 13257 + }, + { + "epoch": 0.212128, + "grad_norm": 0.59765625, + "learning_rate": 7.94241935483871e-05, + "loss": 0.1563, + "step": 13258 + }, + { + "epoch": 0.212144, + "grad_norm": 0.78515625, + "learning_rate": 7.942258064516129e-05, + "loss": 0.163, + "step": 13259 + }, + { + "epoch": 0.21216, + "grad_norm": 0.73828125, + "learning_rate": 7.942096774193549e-05, + "loss": 0.1811, + "step": 13260 + }, + { + "epoch": 0.212176, + "grad_norm": 0.765625, + "learning_rate": 7.941935483870968e-05, + "loss": 0.2008, + "step": 13261 + }, + { + "epoch": 0.212192, + "grad_norm": 0.90234375, + "learning_rate": 7.941774193548388e-05, + "loss": 0.1867, + "step": 13262 + }, + { + "epoch": 0.212208, + "grad_norm": 0.796875, + "learning_rate": 7.941612903225806e-05, + "loss": 0.1711, + "step": 13263 + }, + { + "epoch": 0.212224, + "grad_norm": 0.77734375, + "learning_rate": 7.941451612903226e-05, + "loss": 0.174, + "step": 13264 + }, + { + "epoch": 0.21224, + "grad_norm": 0.7734375, + "learning_rate": 7.941290322580646e-05, + "loss": 0.1746, + "step": 13265 + }, + { + "epoch": 0.212256, + "grad_norm": 0.875, + "learning_rate": 7.941129032258065e-05, + "loss": 0.1232, + "step": 13266 + }, + { + "epoch": 0.212272, + "grad_norm": 0.6953125, + "learning_rate": 7.940967741935485e-05, + "loss": 0.1489, + "step": 13267 + }, + { + "epoch": 0.212288, + "grad_norm": 0.6484375, + "learning_rate": 7.940806451612903e-05, + "loss": 0.1389, + "step": 13268 + }, + { + "epoch": 0.212304, + "grad_norm": 1.0234375, + "learning_rate": 7.940645161290323e-05, + "loss": 0.1766, + "step": 13269 + }, + { + "epoch": 0.21232, + "grad_norm": 0.82421875, + "learning_rate": 7.940483870967742e-05, + "loss": 0.2064, + "step": 13270 + }, + { + "epoch": 0.212336, + "grad_norm": 0.98046875, + "learning_rate": 7.940322580645162e-05, + "loss": 0.1771, + "step": 13271 + }, + { + "epoch": 0.212352, + "grad_norm": 0.62109375, + "learning_rate": 7.94016129032258e-05, + "loss": 0.1619, + "step": 13272 + }, + { + "epoch": 0.212368, + "grad_norm": 1.0234375, + "learning_rate": 7.94e-05, + "loss": 0.1905, + "step": 13273 + }, + { + "epoch": 0.212384, + "grad_norm": 0.58984375, + "learning_rate": 7.939838709677419e-05, + "loss": 0.1648, + "step": 13274 + }, + { + "epoch": 0.2124, + "grad_norm": 0.6875, + "learning_rate": 7.939677419354839e-05, + "loss": 0.1385, + "step": 13275 + }, + { + "epoch": 0.212416, + "grad_norm": 0.96875, + "learning_rate": 7.939516129032258e-05, + "loss": 0.1689, + "step": 13276 + }, + { + "epoch": 0.212432, + "grad_norm": 1.046875, + "learning_rate": 7.939354838709678e-05, + "loss": 0.1587, + "step": 13277 + }, + { + "epoch": 0.212448, + "grad_norm": 1.8984375, + "learning_rate": 7.939193548387098e-05, + "loss": 0.1995, + "step": 13278 + }, + { + "epoch": 0.212464, + "grad_norm": 1.0546875, + "learning_rate": 7.939032258064516e-05, + "loss": 0.1387, + "step": 13279 + }, + { + "epoch": 0.21248, + "grad_norm": 0.83203125, + "learning_rate": 7.938870967741936e-05, + "loss": 0.1539, + "step": 13280 + }, + { + "epoch": 0.212496, + "grad_norm": 0.81640625, + "learning_rate": 7.938709677419355e-05, + "loss": 0.1641, + "step": 13281 + }, + { + "epoch": 0.212512, + "grad_norm": 0.73046875, + "learning_rate": 7.938548387096775e-05, + "loss": 0.1609, + "step": 13282 + }, + { + "epoch": 0.212528, + "grad_norm": 0.6875, + "learning_rate": 7.938387096774193e-05, + "loss": 0.1687, + "step": 13283 + }, + { + "epoch": 0.212544, + "grad_norm": 0.671875, + "learning_rate": 7.938225806451613e-05, + "loss": 0.1752, + "step": 13284 + }, + { + "epoch": 0.21256, + "grad_norm": 1.375, + "learning_rate": 7.938064516129032e-05, + "loss": 0.2033, + "step": 13285 + }, + { + "epoch": 0.212576, + "grad_norm": 0.6796875, + "learning_rate": 7.937903225806452e-05, + "loss": 0.1761, + "step": 13286 + }, + { + "epoch": 0.212592, + "grad_norm": 0.6015625, + "learning_rate": 7.93774193548387e-05, + "loss": 0.1207, + "step": 13287 + }, + { + "epoch": 0.212608, + "grad_norm": 0.66015625, + "learning_rate": 7.93758064516129e-05, + "loss": 0.1766, + "step": 13288 + }, + { + "epoch": 0.212624, + "grad_norm": 0.7265625, + "learning_rate": 7.93741935483871e-05, + "loss": 0.1707, + "step": 13289 + }, + { + "epoch": 0.21264, + "grad_norm": 0.68359375, + "learning_rate": 7.93725806451613e-05, + "loss": 0.1621, + "step": 13290 + }, + { + "epoch": 0.212656, + "grad_norm": 0.91015625, + "learning_rate": 7.937096774193549e-05, + "loss": 0.1917, + "step": 13291 + }, + { + "epoch": 0.212672, + "grad_norm": 0.68359375, + "learning_rate": 7.936935483870969e-05, + "loss": 0.1568, + "step": 13292 + }, + { + "epoch": 0.212688, + "grad_norm": 1.28125, + "learning_rate": 7.936774193548388e-05, + "loss": 0.1773, + "step": 13293 + }, + { + "epoch": 0.212704, + "grad_norm": 1.140625, + "learning_rate": 7.936612903225806e-05, + "loss": 0.213, + "step": 13294 + }, + { + "epoch": 0.21272, + "grad_norm": 0.6015625, + "learning_rate": 7.936451612903226e-05, + "loss": 0.1526, + "step": 13295 + }, + { + "epoch": 0.212736, + "grad_norm": 0.78515625, + "learning_rate": 7.936290322580645e-05, + "loss": 0.1731, + "step": 13296 + }, + { + "epoch": 0.212752, + "grad_norm": 1.1796875, + "learning_rate": 7.936129032258065e-05, + "loss": 0.1653, + "step": 13297 + }, + { + "epoch": 0.212768, + "grad_norm": 0.83984375, + "learning_rate": 7.935967741935483e-05, + "loss": 0.1828, + "step": 13298 + }, + { + "epoch": 0.212784, + "grad_norm": 0.6796875, + "learning_rate": 7.935806451612903e-05, + "loss": 0.1208, + "step": 13299 + }, + { + "epoch": 0.2128, + "grad_norm": 0.96875, + "learning_rate": 7.935645161290323e-05, + "loss": 0.2089, + "step": 13300 + }, + { + "epoch": 0.212816, + "grad_norm": 0.984375, + "learning_rate": 7.935483870967743e-05, + "loss": 0.1711, + "step": 13301 + }, + { + "epoch": 0.212832, + "grad_norm": 0.90625, + "learning_rate": 7.935322580645162e-05, + "loss": 0.1865, + "step": 13302 + }, + { + "epoch": 0.212848, + "grad_norm": 0.78125, + "learning_rate": 7.935161290322582e-05, + "loss": 0.162, + "step": 13303 + }, + { + "epoch": 0.212864, + "grad_norm": 0.984375, + "learning_rate": 7.935e-05, + "loss": 0.1846, + "step": 13304 + }, + { + "epoch": 0.21288, + "grad_norm": 0.84765625, + "learning_rate": 7.93483870967742e-05, + "loss": 0.1527, + "step": 13305 + }, + { + "epoch": 0.212896, + "grad_norm": 0.609375, + "learning_rate": 7.934677419354839e-05, + "loss": 0.1516, + "step": 13306 + }, + { + "epoch": 0.212912, + "grad_norm": 0.94140625, + "learning_rate": 7.934516129032259e-05, + "loss": 0.191, + "step": 13307 + }, + { + "epoch": 0.212928, + "grad_norm": 1.359375, + "learning_rate": 7.934354838709678e-05, + "loss": 0.1671, + "step": 13308 + }, + { + "epoch": 0.212944, + "grad_norm": 0.6796875, + "learning_rate": 7.934193548387096e-05, + "loss": 0.1799, + "step": 13309 + }, + { + "epoch": 0.21296, + "grad_norm": 0.5703125, + "learning_rate": 7.934032258064516e-05, + "loss": 0.1702, + "step": 13310 + }, + { + "epoch": 0.212976, + "grad_norm": 0.455078125, + "learning_rate": 7.933870967741935e-05, + "loss": 0.157, + "step": 13311 + }, + { + "epoch": 0.212992, + "grad_norm": 0.6015625, + "learning_rate": 7.933709677419355e-05, + "loss": 0.1781, + "step": 13312 + }, + { + "epoch": 0.213008, + "grad_norm": 0.76953125, + "learning_rate": 7.933548387096775e-05, + "loss": 0.1649, + "step": 13313 + }, + { + "epoch": 0.213024, + "grad_norm": 0.75, + "learning_rate": 7.933387096774195e-05, + "loss": 0.1717, + "step": 13314 + }, + { + "epoch": 0.21304, + "grad_norm": 0.78515625, + "learning_rate": 7.933225806451613e-05, + "loss": 0.1846, + "step": 13315 + }, + { + "epoch": 0.213056, + "grad_norm": 1.0859375, + "learning_rate": 7.933064516129033e-05, + "loss": 0.166, + "step": 13316 + }, + { + "epoch": 0.213072, + "grad_norm": 0.82421875, + "learning_rate": 7.932903225806452e-05, + "loss": 0.1938, + "step": 13317 + }, + { + "epoch": 0.213088, + "grad_norm": 0.81640625, + "learning_rate": 7.932741935483872e-05, + "loss": 0.1393, + "step": 13318 + }, + { + "epoch": 0.213104, + "grad_norm": 0.578125, + "learning_rate": 7.93258064516129e-05, + "loss": 0.1526, + "step": 13319 + }, + { + "epoch": 0.21312, + "grad_norm": 0.703125, + "learning_rate": 7.93241935483871e-05, + "loss": 0.1745, + "step": 13320 + }, + { + "epoch": 0.213136, + "grad_norm": 0.57421875, + "learning_rate": 7.932258064516129e-05, + "loss": 0.1522, + "step": 13321 + }, + { + "epoch": 0.213152, + "grad_norm": 0.4453125, + "learning_rate": 7.932096774193549e-05, + "loss": 0.1356, + "step": 13322 + }, + { + "epoch": 0.213168, + "grad_norm": 0.8046875, + "learning_rate": 7.931935483870968e-05, + "loss": 0.1661, + "step": 13323 + }, + { + "epoch": 0.213184, + "grad_norm": 0.625, + "learning_rate": 7.931774193548387e-05, + "loss": 0.1496, + "step": 13324 + }, + { + "epoch": 0.2132, + "grad_norm": 0.625, + "learning_rate": 7.931612903225807e-05, + "loss": 0.1985, + "step": 13325 + }, + { + "epoch": 0.213216, + "grad_norm": 0.515625, + "learning_rate": 7.931451612903226e-05, + "loss": 0.1506, + "step": 13326 + }, + { + "epoch": 0.213232, + "grad_norm": 0.578125, + "learning_rate": 7.931290322580646e-05, + "loss": 0.1644, + "step": 13327 + }, + { + "epoch": 0.213248, + "grad_norm": 0.8828125, + "learning_rate": 7.931129032258065e-05, + "loss": 0.2306, + "step": 13328 + }, + { + "epoch": 0.213264, + "grad_norm": 0.61328125, + "learning_rate": 7.930967741935485e-05, + "loss": 0.1556, + "step": 13329 + }, + { + "epoch": 0.21328, + "grad_norm": 0.81640625, + "learning_rate": 7.930806451612903e-05, + "loss": 0.1765, + "step": 13330 + }, + { + "epoch": 0.213296, + "grad_norm": 0.578125, + "learning_rate": 7.930645161290323e-05, + "loss": 0.1735, + "step": 13331 + }, + { + "epoch": 0.213312, + "grad_norm": 0.66015625, + "learning_rate": 7.930483870967742e-05, + "loss": 0.1692, + "step": 13332 + }, + { + "epoch": 0.213328, + "grad_norm": 0.70703125, + "learning_rate": 7.930322580645162e-05, + "loss": 0.148, + "step": 13333 + }, + { + "epoch": 0.213344, + "grad_norm": 0.6953125, + "learning_rate": 7.93016129032258e-05, + "loss": 0.1991, + "step": 13334 + }, + { + "epoch": 0.21336, + "grad_norm": 0.84375, + "learning_rate": 7.93e-05, + "loss": 0.1538, + "step": 13335 + }, + { + "epoch": 0.213376, + "grad_norm": 0.83984375, + "learning_rate": 7.92983870967742e-05, + "loss": 0.18, + "step": 13336 + }, + { + "epoch": 0.213392, + "grad_norm": 0.53515625, + "learning_rate": 7.929677419354839e-05, + "loss": 0.171, + "step": 13337 + }, + { + "epoch": 0.213408, + "grad_norm": 0.8203125, + "learning_rate": 7.929516129032259e-05, + "loss": 0.1835, + "step": 13338 + }, + { + "epoch": 0.213424, + "grad_norm": 0.69140625, + "learning_rate": 7.929354838709679e-05, + "loss": 0.1761, + "step": 13339 + }, + { + "epoch": 0.21344, + "grad_norm": 0.83984375, + "learning_rate": 7.929193548387097e-05, + "loss": 0.2053, + "step": 13340 + }, + { + "epoch": 0.213456, + "grad_norm": 0.9140625, + "learning_rate": 7.929032258064516e-05, + "loss": 0.1833, + "step": 13341 + }, + { + "epoch": 0.213472, + "grad_norm": 0.478515625, + "learning_rate": 7.928870967741936e-05, + "loss": 0.1446, + "step": 13342 + }, + { + "epoch": 0.213488, + "grad_norm": 0.62890625, + "learning_rate": 7.928709677419355e-05, + "loss": 0.1774, + "step": 13343 + }, + { + "epoch": 0.213504, + "grad_norm": 1.3671875, + "learning_rate": 7.928548387096775e-05, + "loss": 0.1865, + "step": 13344 + }, + { + "epoch": 0.21352, + "grad_norm": 0.8125, + "learning_rate": 7.928387096774193e-05, + "loss": 0.2035, + "step": 13345 + }, + { + "epoch": 0.213536, + "grad_norm": 1.0390625, + "learning_rate": 7.928225806451613e-05, + "loss": 0.2082, + "step": 13346 + }, + { + "epoch": 0.213552, + "grad_norm": 0.984375, + "learning_rate": 7.928064516129032e-05, + "loss": 0.1863, + "step": 13347 + }, + { + "epoch": 0.213568, + "grad_norm": 0.921875, + "learning_rate": 7.927903225806452e-05, + "loss": 0.1592, + "step": 13348 + }, + { + "epoch": 0.213584, + "grad_norm": 1.1484375, + "learning_rate": 7.927741935483872e-05, + "loss": 0.1514, + "step": 13349 + }, + { + "epoch": 0.2136, + "grad_norm": 0.6875, + "learning_rate": 7.927580645161292e-05, + "loss": 0.1354, + "step": 13350 + }, + { + "epoch": 0.213616, + "grad_norm": 0.5390625, + "learning_rate": 7.92741935483871e-05, + "loss": 0.1381, + "step": 13351 + }, + { + "epoch": 0.213632, + "grad_norm": 1.15625, + "learning_rate": 7.92725806451613e-05, + "loss": 0.1987, + "step": 13352 + }, + { + "epoch": 0.213648, + "grad_norm": 1.4375, + "learning_rate": 7.927096774193549e-05, + "loss": 0.1847, + "step": 13353 + }, + { + "epoch": 0.213664, + "grad_norm": 0.66796875, + "learning_rate": 7.926935483870969e-05, + "loss": 0.1622, + "step": 13354 + }, + { + "epoch": 0.21368, + "grad_norm": 0.46875, + "learning_rate": 7.926774193548387e-05, + "loss": 0.1468, + "step": 13355 + }, + { + "epoch": 0.213696, + "grad_norm": 0.59765625, + "learning_rate": 7.926612903225806e-05, + "loss": 0.1705, + "step": 13356 + }, + { + "epoch": 0.213712, + "grad_norm": 0.75390625, + "learning_rate": 7.926451612903226e-05, + "loss": 0.1654, + "step": 13357 + }, + { + "epoch": 0.213728, + "grad_norm": 0.6640625, + "learning_rate": 7.926290322580645e-05, + "loss": 0.1719, + "step": 13358 + }, + { + "epoch": 0.213744, + "grad_norm": 0.94140625, + "learning_rate": 7.926129032258065e-05, + "loss": 0.2143, + "step": 13359 + }, + { + "epoch": 0.21376, + "grad_norm": 0.75, + "learning_rate": 7.925967741935484e-05, + "loss": 0.1515, + "step": 13360 + }, + { + "epoch": 0.213776, + "grad_norm": 1.203125, + "learning_rate": 7.925806451612904e-05, + "loss": 0.1879, + "step": 13361 + }, + { + "epoch": 0.213792, + "grad_norm": 0.796875, + "learning_rate": 7.925645161290323e-05, + "loss": 0.2108, + "step": 13362 + }, + { + "epoch": 0.213808, + "grad_norm": 0.90625, + "learning_rate": 7.925483870967743e-05, + "loss": 0.1556, + "step": 13363 + }, + { + "epoch": 0.213824, + "grad_norm": 0.8984375, + "learning_rate": 7.925322580645162e-05, + "loss": 0.2035, + "step": 13364 + }, + { + "epoch": 0.21384, + "grad_norm": 0.8203125, + "learning_rate": 7.925161290322582e-05, + "loss": 0.1645, + "step": 13365 + }, + { + "epoch": 0.213856, + "grad_norm": 0.73046875, + "learning_rate": 7.925e-05, + "loss": 0.1953, + "step": 13366 + }, + { + "epoch": 0.213872, + "grad_norm": 0.65625, + "learning_rate": 7.92483870967742e-05, + "loss": 0.1907, + "step": 13367 + }, + { + "epoch": 0.213888, + "grad_norm": 0.8125, + "learning_rate": 7.924677419354839e-05, + "loss": 0.133, + "step": 13368 + }, + { + "epoch": 0.213904, + "grad_norm": 0.9375, + "learning_rate": 7.924516129032259e-05, + "loss": 0.2055, + "step": 13369 + }, + { + "epoch": 0.21392, + "grad_norm": 1.2734375, + "learning_rate": 7.924354838709677e-05, + "loss": 0.2235, + "step": 13370 + }, + { + "epoch": 0.213936, + "grad_norm": 0.6875, + "learning_rate": 7.924193548387096e-05, + "loss": 0.168, + "step": 13371 + }, + { + "epoch": 0.213952, + "grad_norm": 0.57421875, + "learning_rate": 7.924032258064516e-05, + "loss": 0.1434, + "step": 13372 + }, + { + "epoch": 0.213968, + "grad_norm": 0.6953125, + "learning_rate": 7.923870967741936e-05, + "loss": 0.1699, + "step": 13373 + }, + { + "epoch": 0.213984, + "grad_norm": 0.8984375, + "learning_rate": 7.923709677419356e-05, + "loss": 0.1471, + "step": 13374 + }, + { + "epoch": 0.214, + "grad_norm": 0.9140625, + "learning_rate": 7.923548387096774e-05, + "loss": 0.1703, + "step": 13375 + }, + { + "epoch": 0.214016, + "grad_norm": 0.55078125, + "learning_rate": 7.923387096774194e-05, + "loss": 0.1437, + "step": 13376 + }, + { + "epoch": 0.214032, + "grad_norm": 1.21875, + "learning_rate": 7.923225806451613e-05, + "loss": 0.1828, + "step": 13377 + }, + { + "epoch": 0.214048, + "grad_norm": 1.0, + "learning_rate": 7.923064516129033e-05, + "loss": 0.1459, + "step": 13378 + }, + { + "epoch": 0.214064, + "grad_norm": 0.74609375, + "learning_rate": 7.922903225806452e-05, + "loss": 0.1617, + "step": 13379 + }, + { + "epoch": 0.21408, + "grad_norm": 1.265625, + "learning_rate": 7.922741935483872e-05, + "loss": 0.2078, + "step": 13380 + }, + { + "epoch": 0.214096, + "grad_norm": 0.64453125, + "learning_rate": 7.92258064516129e-05, + "loss": 0.1313, + "step": 13381 + }, + { + "epoch": 0.214112, + "grad_norm": 0.9375, + "learning_rate": 7.92241935483871e-05, + "loss": 0.1697, + "step": 13382 + }, + { + "epoch": 0.214128, + "grad_norm": 0.93359375, + "learning_rate": 7.922258064516129e-05, + "loss": 0.1648, + "step": 13383 + }, + { + "epoch": 0.214144, + "grad_norm": 0.7734375, + "learning_rate": 7.922096774193549e-05, + "loss": 0.1723, + "step": 13384 + }, + { + "epoch": 0.21416, + "grad_norm": 0.40234375, + "learning_rate": 7.921935483870969e-05, + "loss": 0.1109, + "step": 13385 + }, + { + "epoch": 0.214176, + "grad_norm": 0.70703125, + "learning_rate": 7.921774193548389e-05, + "loss": 0.1526, + "step": 13386 + }, + { + "epoch": 0.214192, + "grad_norm": 1.0859375, + "learning_rate": 7.921612903225807e-05, + "loss": 0.1871, + "step": 13387 + }, + { + "epoch": 0.214208, + "grad_norm": 1.015625, + "learning_rate": 7.921451612903226e-05, + "loss": 0.183, + "step": 13388 + }, + { + "epoch": 0.214224, + "grad_norm": 1.1015625, + "learning_rate": 7.921290322580646e-05, + "loss": 0.1616, + "step": 13389 + }, + { + "epoch": 0.21424, + "grad_norm": 1.0546875, + "learning_rate": 7.921129032258064e-05, + "loss": 0.1917, + "step": 13390 + }, + { + "epoch": 0.214256, + "grad_norm": 1.40625, + "learning_rate": 7.920967741935484e-05, + "loss": 0.2198, + "step": 13391 + }, + { + "epoch": 0.214272, + "grad_norm": 0.8828125, + "learning_rate": 7.920806451612903e-05, + "loss": 0.2201, + "step": 13392 + }, + { + "epoch": 0.214288, + "grad_norm": 1.6796875, + "learning_rate": 7.920645161290323e-05, + "loss": 0.1789, + "step": 13393 + }, + { + "epoch": 0.214304, + "grad_norm": 0.6171875, + "learning_rate": 7.920483870967742e-05, + "loss": 0.2104, + "step": 13394 + }, + { + "epoch": 0.21432, + "grad_norm": 0.65625, + "learning_rate": 7.920322580645161e-05, + "loss": 0.1793, + "step": 13395 + }, + { + "epoch": 0.214336, + "grad_norm": 0.609375, + "learning_rate": 7.920161290322581e-05, + "loss": 0.1796, + "step": 13396 + }, + { + "epoch": 0.214352, + "grad_norm": 0.7890625, + "learning_rate": 7.920000000000001e-05, + "loss": 0.1984, + "step": 13397 + }, + { + "epoch": 0.214368, + "grad_norm": 0.6328125, + "learning_rate": 7.91983870967742e-05, + "loss": 0.1923, + "step": 13398 + }, + { + "epoch": 0.214384, + "grad_norm": 0.73828125, + "learning_rate": 7.91967741935484e-05, + "loss": 0.1757, + "step": 13399 + }, + { + "epoch": 0.2144, + "grad_norm": 1.1953125, + "learning_rate": 7.919516129032259e-05, + "loss": 0.1935, + "step": 13400 + }, + { + "epoch": 0.214416, + "grad_norm": 0.73046875, + "learning_rate": 7.919354838709679e-05, + "loss": 0.1763, + "step": 13401 + }, + { + "epoch": 0.214432, + "grad_norm": 0.79296875, + "learning_rate": 7.919193548387097e-05, + "loss": 0.1468, + "step": 13402 + }, + { + "epoch": 0.214448, + "grad_norm": 0.65625, + "learning_rate": 7.919032258064516e-05, + "loss": 0.1846, + "step": 13403 + }, + { + "epoch": 0.214464, + "grad_norm": 1.21875, + "learning_rate": 7.918870967741936e-05, + "loss": 0.1671, + "step": 13404 + }, + { + "epoch": 0.21448, + "grad_norm": 1.3359375, + "learning_rate": 7.918709677419354e-05, + "loss": 0.2024, + "step": 13405 + }, + { + "epoch": 0.214496, + "grad_norm": 0.828125, + "learning_rate": 7.918548387096774e-05, + "loss": 0.1717, + "step": 13406 + }, + { + "epoch": 0.214512, + "grad_norm": 0.93359375, + "learning_rate": 7.918387096774193e-05, + "loss": 0.1726, + "step": 13407 + }, + { + "epoch": 0.214528, + "grad_norm": 0.6171875, + "learning_rate": 7.918225806451613e-05, + "loss": 0.1553, + "step": 13408 + }, + { + "epoch": 0.214544, + "grad_norm": 0.74609375, + "learning_rate": 7.918064516129033e-05, + "loss": 0.1663, + "step": 13409 + }, + { + "epoch": 0.21456, + "grad_norm": 0.8359375, + "learning_rate": 7.917903225806453e-05, + "loss": 0.2375, + "step": 13410 + }, + { + "epoch": 0.214576, + "grad_norm": 0.890625, + "learning_rate": 7.917741935483871e-05, + "loss": 0.1613, + "step": 13411 + }, + { + "epoch": 0.214592, + "grad_norm": 0.53125, + "learning_rate": 7.917580645161291e-05, + "loss": 0.1584, + "step": 13412 + }, + { + "epoch": 0.214608, + "grad_norm": 0.9453125, + "learning_rate": 7.91741935483871e-05, + "loss": 0.2067, + "step": 13413 + }, + { + "epoch": 0.214624, + "grad_norm": 1.03125, + "learning_rate": 7.91725806451613e-05, + "loss": 0.1726, + "step": 13414 + }, + { + "epoch": 0.21464, + "grad_norm": 0.91015625, + "learning_rate": 7.917096774193549e-05, + "loss": 0.1836, + "step": 13415 + }, + { + "epoch": 0.214656, + "grad_norm": 0.70703125, + "learning_rate": 7.916935483870969e-05, + "loss": 0.1452, + "step": 13416 + }, + { + "epoch": 0.214672, + "grad_norm": 0.703125, + "learning_rate": 7.916774193548387e-05, + "loss": 0.1906, + "step": 13417 + }, + { + "epoch": 0.214688, + "grad_norm": 0.6953125, + "learning_rate": 7.916612903225806e-05, + "loss": 0.1646, + "step": 13418 + }, + { + "epoch": 0.214704, + "grad_norm": 0.78125, + "learning_rate": 7.916451612903226e-05, + "loss": 0.1892, + "step": 13419 + }, + { + "epoch": 0.21472, + "grad_norm": 0.703125, + "learning_rate": 7.916290322580646e-05, + "loss": 0.192, + "step": 13420 + }, + { + "epoch": 0.214736, + "grad_norm": 0.85546875, + "learning_rate": 7.916129032258066e-05, + "loss": 0.1795, + "step": 13421 + }, + { + "epoch": 0.214752, + "grad_norm": 0.703125, + "learning_rate": 7.915967741935484e-05, + "loss": 0.1969, + "step": 13422 + }, + { + "epoch": 0.214768, + "grad_norm": 1.359375, + "learning_rate": 7.915806451612904e-05, + "loss": 0.1762, + "step": 13423 + }, + { + "epoch": 0.214784, + "grad_norm": 0.8828125, + "learning_rate": 7.915645161290323e-05, + "loss": 0.1838, + "step": 13424 + }, + { + "epoch": 0.2148, + "grad_norm": 1.3125, + "learning_rate": 7.915483870967743e-05, + "loss": 0.1827, + "step": 13425 + }, + { + "epoch": 0.214816, + "grad_norm": 1.0546875, + "learning_rate": 7.915322580645161e-05, + "loss": 0.1363, + "step": 13426 + }, + { + "epoch": 0.214832, + "grad_norm": 0.796875, + "learning_rate": 7.915161290322581e-05, + "loss": 0.1517, + "step": 13427 + }, + { + "epoch": 0.214848, + "grad_norm": 0.734375, + "learning_rate": 7.915e-05, + "loss": 0.1311, + "step": 13428 + }, + { + "epoch": 0.214864, + "grad_norm": 0.9375, + "learning_rate": 7.91483870967742e-05, + "loss": 0.1415, + "step": 13429 + }, + { + "epoch": 0.21488, + "grad_norm": 1.21875, + "learning_rate": 7.914677419354839e-05, + "loss": 0.189, + "step": 13430 + }, + { + "epoch": 0.214896, + "grad_norm": 0.58203125, + "learning_rate": 7.914516129032258e-05, + "loss": 0.1921, + "step": 13431 + }, + { + "epoch": 0.214912, + "grad_norm": 1.6484375, + "learning_rate": 7.914354838709677e-05, + "loss": 0.189, + "step": 13432 + }, + { + "epoch": 0.214928, + "grad_norm": 0.76953125, + "learning_rate": 7.914193548387097e-05, + "loss": 0.1609, + "step": 13433 + }, + { + "epoch": 0.214944, + "grad_norm": 0.79296875, + "learning_rate": 7.914032258064517e-05, + "loss": 0.1826, + "step": 13434 + }, + { + "epoch": 0.21496, + "grad_norm": 0.74609375, + "learning_rate": 7.913870967741936e-05, + "loss": 0.1412, + "step": 13435 + }, + { + "epoch": 0.214976, + "grad_norm": 1.0859375, + "learning_rate": 7.913709677419356e-05, + "loss": 0.1541, + "step": 13436 + }, + { + "epoch": 0.214992, + "grad_norm": 0.62890625, + "learning_rate": 7.913548387096774e-05, + "loss": 0.1481, + "step": 13437 + }, + { + "epoch": 0.215008, + "grad_norm": 1.203125, + "learning_rate": 7.913387096774194e-05, + "loss": 0.1757, + "step": 13438 + }, + { + "epoch": 0.215024, + "grad_norm": 0.6484375, + "learning_rate": 7.913225806451613e-05, + "loss": 0.1703, + "step": 13439 + }, + { + "epoch": 0.21504, + "grad_norm": 0.86328125, + "learning_rate": 7.913064516129033e-05, + "loss": 0.2022, + "step": 13440 + }, + { + "epoch": 0.215056, + "grad_norm": 1.1015625, + "learning_rate": 7.912903225806451e-05, + "loss": 0.1644, + "step": 13441 + }, + { + "epoch": 0.215072, + "grad_norm": 0.6640625, + "learning_rate": 7.912741935483871e-05, + "loss": 0.1599, + "step": 13442 + }, + { + "epoch": 0.215088, + "grad_norm": 0.7421875, + "learning_rate": 7.91258064516129e-05, + "loss": 0.154, + "step": 13443 + }, + { + "epoch": 0.215104, + "grad_norm": 1.0390625, + "learning_rate": 7.91241935483871e-05, + "loss": 0.184, + "step": 13444 + }, + { + "epoch": 0.21512, + "grad_norm": 0.5703125, + "learning_rate": 7.91225806451613e-05, + "loss": 0.1361, + "step": 13445 + }, + { + "epoch": 0.215136, + "grad_norm": 0.62109375, + "learning_rate": 7.91209677419355e-05, + "loss": 0.1631, + "step": 13446 + }, + { + "epoch": 0.215152, + "grad_norm": 0.80859375, + "learning_rate": 7.911935483870968e-05, + "loss": 0.1773, + "step": 13447 + }, + { + "epoch": 0.215168, + "grad_norm": 0.83203125, + "learning_rate": 7.911774193548388e-05, + "loss": 0.1838, + "step": 13448 + }, + { + "epoch": 0.215184, + "grad_norm": 1.0078125, + "learning_rate": 7.911612903225807e-05, + "loss": 0.1839, + "step": 13449 + }, + { + "epoch": 0.2152, + "grad_norm": 0.78125, + "learning_rate": 7.911451612903226e-05, + "loss": 0.1446, + "step": 13450 + }, + { + "epoch": 0.215216, + "grad_norm": 0.51953125, + "learning_rate": 7.911290322580646e-05, + "loss": 0.1753, + "step": 13451 + }, + { + "epoch": 0.215232, + "grad_norm": 0.95703125, + "learning_rate": 7.911129032258064e-05, + "loss": 0.1633, + "step": 13452 + }, + { + "epoch": 0.215248, + "grad_norm": 0.60546875, + "learning_rate": 7.910967741935484e-05, + "loss": 0.1559, + "step": 13453 + }, + { + "epoch": 0.215264, + "grad_norm": 0.71484375, + "learning_rate": 7.910806451612903e-05, + "loss": 0.1871, + "step": 13454 + }, + { + "epoch": 0.21528, + "grad_norm": 0.6328125, + "learning_rate": 7.910645161290323e-05, + "loss": 0.1482, + "step": 13455 + }, + { + "epoch": 0.215296, + "grad_norm": 0.94921875, + "learning_rate": 7.910483870967743e-05, + "loss": 0.1799, + "step": 13456 + }, + { + "epoch": 0.215312, + "grad_norm": 1.171875, + "learning_rate": 7.910322580645163e-05, + "loss": 0.2161, + "step": 13457 + }, + { + "epoch": 0.215328, + "grad_norm": 0.8515625, + "learning_rate": 7.910161290322581e-05, + "loss": 0.1715, + "step": 13458 + }, + { + "epoch": 0.215344, + "grad_norm": 0.71875, + "learning_rate": 7.910000000000001e-05, + "loss": 0.1672, + "step": 13459 + }, + { + "epoch": 0.21536, + "grad_norm": 0.75, + "learning_rate": 7.90983870967742e-05, + "loss": 0.1667, + "step": 13460 + }, + { + "epoch": 0.215376, + "grad_norm": 0.5859375, + "learning_rate": 7.90967741935484e-05, + "loss": 0.1624, + "step": 13461 + }, + { + "epoch": 0.215392, + "grad_norm": 1.2890625, + "learning_rate": 7.909516129032258e-05, + "loss": 0.2119, + "step": 13462 + }, + { + "epoch": 0.215408, + "grad_norm": 0.66796875, + "learning_rate": 7.909354838709678e-05, + "loss": 0.1525, + "step": 13463 + }, + { + "epoch": 0.215424, + "grad_norm": 0.71484375, + "learning_rate": 7.909193548387097e-05, + "loss": 0.1925, + "step": 13464 + }, + { + "epoch": 0.21544, + "grad_norm": 0.70703125, + "learning_rate": 7.909032258064516e-05, + "loss": 0.1621, + "step": 13465 + }, + { + "epoch": 0.215456, + "grad_norm": 0.671875, + "learning_rate": 7.908870967741935e-05, + "loss": 0.1669, + "step": 13466 + }, + { + "epoch": 0.215472, + "grad_norm": 0.66015625, + "learning_rate": 7.908709677419354e-05, + "loss": 0.1437, + "step": 13467 + }, + { + "epoch": 0.215488, + "grad_norm": 0.65625, + "learning_rate": 7.908548387096774e-05, + "loss": 0.1473, + "step": 13468 + }, + { + "epoch": 0.215504, + "grad_norm": 0.82421875, + "learning_rate": 7.908387096774194e-05, + "loss": 0.1855, + "step": 13469 + }, + { + "epoch": 0.21552, + "grad_norm": 0.5625, + "learning_rate": 7.908225806451614e-05, + "loss": 0.1591, + "step": 13470 + }, + { + "epoch": 0.215536, + "grad_norm": 0.4375, + "learning_rate": 7.908064516129033e-05, + "loss": 0.1275, + "step": 13471 + }, + { + "epoch": 0.215552, + "grad_norm": 0.74609375, + "learning_rate": 7.907903225806453e-05, + "loss": 0.1804, + "step": 13472 + }, + { + "epoch": 0.215568, + "grad_norm": 0.6875, + "learning_rate": 7.907741935483871e-05, + "loss": 0.1948, + "step": 13473 + }, + { + "epoch": 0.215584, + "grad_norm": 0.95703125, + "learning_rate": 7.907580645161291e-05, + "loss": 0.207, + "step": 13474 + }, + { + "epoch": 0.2156, + "grad_norm": 0.75390625, + "learning_rate": 7.90741935483871e-05, + "loss": 0.1592, + "step": 13475 + }, + { + "epoch": 0.215616, + "grad_norm": 0.73828125, + "learning_rate": 7.90725806451613e-05, + "loss": 0.1708, + "step": 13476 + }, + { + "epoch": 0.215632, + "grad_norm": 0.8359375, + "learning_rate": 7.907096774193548e-05, + "loss": 0.1578, + "step": 13477 + }, + { + "epoch": 0.215648, + "grad_norm": 0.9140625, + "learning_rate": 7.906935483870968e-05, + "loss": 0.1907, + "step": 13478 + }, + { + "epoch": 0.215664, + "grad_norm": 0.75, + "learning_rate": 7.906774193548387e-05, + "loss": 0.193, + "step": 13479 + }, + { + "epoch": 0.21568, + "grad_norm": 0.609375, + "learning_rate": 7.906612903225807e-05, + "loss": 0.1801, + "step": 13480 + }, + { + "epoch": 0.215696, + "grad_norm": 1.25, + "learning_rate": 7.906451612903227e-05, + "loss": 0.2003, + "step": 13481 + }, + { + "epoch": 0.215712, + "grad_norm": 0.6640625, + "learning_rate": 7.906290322580645e-05, + "loss": 0.1598, + "step": 13482 + }, + { + "epoch": 0.215728, + "grad_norm": 0.5859375, + "learning_rate": 7.906129032258065e-05, + "loss": 0.1685, + "step": 13483 + }, + { + "epoch": 0.215744, + "grad_norm": 0.68359375, + "learning_rate": 7.905967741935484e-05, + "loss": 0.1359, + "step": 13484 + }, + { + "epoch": 0.21576, + "grad_norm": 0.8515625, + "learning_rate": 7.905806451612904e-05, + "loss": 0.1646, + "step": 13485 + }, + { + "epoch": 0.215776, + "grad_norm": 0.9921875, + "learning_rate": 7.905645161290323e-05, + "loss": 0.1792, + "step": 13486 + }, + { + "epoch": 0.215792, + "grad_norm": 0.75390625, + "learning_rate": 7.905483870967743e-05, + "loss": 0.1633, + "step": 13487 + }, + { + "epoch": 0.215808, + "grad_norm": 0.494140625, + "learning_rate": 7.905322580645161e-05, + "loss": 0.1705, + "step": 13488 + }, + { + "epoch": 0.215824, + "grad_norm": 0.6796875, + "learning_rate": 7.905161290322581e-05, + "loss": 0.1772, + "step": 13489 + }, + { + "epoch": 0.21584, + "grad_norm": 0.875, + "learning_rate": 7.905e-05, + "loss": 0.1664, + "step": 13490 + }, + { + "epoch": 0.215856, + "grad_norm": 0.69140625, + "learning_rate": 7.90483870967742e-05, + "loss": 0.1794, + "step": 13491 + }, + { + "epoch": 0.215872, + "grad_norm": 0.78125, + "learning_rate": 7.90467741935484e-05, + "loss": 0.1702, + "step": 13492 + }, + { + "epoch": 0.215888, + "grad_norm": 0.734375, + "learning_rate": 7.904516129032258e-05, + "loss": 0.1709, + "step": 13493 + }, + { + "epoch": 0.215904, + "grad_norm": 0.6953125, + "learning_rate": 7.904354838709678e-05, + "loss": 0.191, + "step": 13494 + }, + { + "epoch": 0.21592, + "grad_norm": 0.87890625, + "learning_rate": 7.904193548387098e-05, + "loss": 0.1619, + "step": 13495 + }, + { + "epoch": 0.215936, + "grad_norm": 0.5703125, + "learning_rate": 7.904032258064517e-05, + "loss": 0.1597, + "step": 13496 + }, + { + "epoch": 0.215952, + "grad_norm": 0.93359375, + "learning_rate": 7.903870967741935e-05, + "loss": 0.1657, + "step": 13497 + }, + { + "epoch": 0.215968, + "grad_norm": 0.80078125, + "learning_rate": 7.903709677419355e-05, + "loss": 0.1522, + "step": 13498 + }, + { + "epoch": 0.215984, + "grad_norm": 0.4921875, + "learning_rate": 7.903548387096774e-05, + "loss": 0.135, + "step": 13499 + }, + { + "epoch": 0.216, + "grad_norm": 0.71875, + "learning_rate": 7.903387096774194e-05, + "loss": 0.1846, + "step": 13500 + }, + { + "epoch": 0.216016, + "grad_norm": 0.55859375, + "learning_rate": 7.903225806451613e-05, + "loss": 0.1546, + "step": 13501 + }, + { + "epoch": 0.216032, + "grad_norm": 1.203125, + "learning_rate": 7.903064516129032e-05, + "loss": 0.205, + "step": 13502 + }, + { + "epoch": 0.216048, + "grad_norm": 0.5390625, + "learning_rate": 7.902903225806451e-05, + "loss": 0.1332, + "step": 13503 + }, + { + "epoch": 0.216064, + "grad_norm": 0.8203125, + "learning_rate": 7.902741935483871e-05, + "loss": 0.1697, + "step": 13504 + }, + { + "epoch": 0.21608, + "grad_norm": 0.6796875, + "learning_rate": 7.902580645161291e-05, + "loss": 0.1799, + "step": 13505 + }, + { + "epoch": 0.216096, + "grad_norm": 0.490234375, + "learning_rate": 7.902419354838711e-05, + "loss": 0.1403, + "step": 13506 + }, + { + "epoch": 0.216112, + "grad_norm": 0.75, + "learning_rate": 7.90225806451613e-05, + "loss": 0.178, + "step": 13507 + }, + { + "epoch": 0.216128, + "grad_norm": 0.66015625, + "learning_rate": 7.90209677419355e-05, + "loss": 0.1756, + "step": 13508 + }, + { + "epoch": 0.216144, + "grad_norm": 0.63671875, + "learning_rate": 7.901935483870968e-05, + "loss": 0.1455, + "step": 13509 + }, + { + "epoch": 0.21616, + "grad_norm": 0.609375, + "learning_rate": 7.901774193548388e-05, + "loss": 0.1632, + "step": 13510 + }, + { + "epoch": 0.216176, + "grad_norm": 0.55859375, + "learning_rate": 7.901612903225807e-05, + "loss": 0.1564, + "step": 13511 + }, + { + "epoch": 0.216192, + "grad_norm": 0.91015625, + "learning_rate": 7.901451612903225e-05, + "loss": 0.174, + "step": 13512 + }, + { + "epoch": 0.216208, + "grad_norm": 0.94921875, + "learning_rate": 7.901290322580645e-05, + "loss": 0.2124, + "step": 13513 + }, + { + "epoch": 0.216224, + "grad_norm": 0.91015625, + "learning_rate": 7.901129032258064e-05, + "loss": 0.1519, + "step": 13514 + }, + { + "epoch": 0.21624, + "grad_norm": 0.6484375, + "learning_rate": 7.900967741935484e-05, + "loss": 0.1685, + "step": 13515 + }, + { + "epoch": 0.216256, + "grad_norm": 1.0, + "learning_rate": 7.900806451612904e-05, + "loss": 0.1791, + "step": 13516 + }, + { + "epoch": 0.216272, + "grad_norm": 0.6796875, + "learning_rate": 7.900645161290324e-05, + "loss": 0.1386, + "step": 13517 + }, + { + "epoch": 0.216288, + "grad_norm": 0.75, + "learning_rate": 7.900483870967742e-05, + "loss": 0.1236, + "step": 13518 + }, + { + "epoch": 0.216304, + "grad_norm": 0.9453125, + "learning_rate": 7.900322580645162e-05, + "loss": 0.1665, + "step": 13519 + }, + { + "epoch": 0.21632, + "grad_norm": 0.94140625, + "learning_rate": 7.900161290322581e-05, + "loss": 0.1708, + "step": 13520 + }, + { + "epoch": 0.216336, + "grad_norm": 1.171875, + "learning_rate": 7.900000000000001e-05, + "loss": 0.1756, + "step": 13521 + }, + { + "epoch": 0.216352, + "grad_norm": 1.0234375, + "learning_rate": 7.89983870967742e-05, + "loss": 0.2184, + "step": 13522 + }, + { + "epoch": 0.216368, + "grad_norm": 0.7109375, + "learning_rate": 7.89967741935484e-05, + "loss": 0.166, + "step": 13523 + }, + { + "epoch": 0.216384, + "grad_norm": 1.8359375, + "learning_rate": 7.899516129032258e-05, + "loss": 0.1778, + "step": 13524 + }, + { + "epoch": 0.2164, + "grad_norm": 1.203125, + "learning_rate": 7.899354838709678e-05, + "loss": 0.2077, + "step": 13525 + }, + { + "epoch": 0.216416, + "grad_norm": 0.71875, + "learning_rate": 7.899193548387097e-05, + "loss": 0.1755, + "step": 13526 + }, + { + "epoch": 0.216432, + "grad_norm": 0.84765625, + "learning_rate": 7.899032258064515e-05, + "loss": 0.1871, + "step": 13527 + }, + { + "epoch": 0.216448, + "grad_norm": 0.74609375, + "learning_rate": 7.898870967741935e-05, + "loss": 0.1847, + "step": 13528 + }, + { + "epoch": 0.216464, + "grad_norm": 1.1953125, + "learning_rate": 7.898709677419355e-05, + "loss": 0.1599, + "step": 13529 + }, + { + "epoch": 0.21648, + "grad_norm": 0.76953125, + "learning_rate": 7.898548387096775e-05, + "loss": 0.1667, + "step": 13530 + }, + { + "epoch": 0.216496, + "grad_norm": 0.92578125, + "learning_rate": 7.898387096774194e-05, + "loss": 0.1431, + "step": 13531 + }, + { + "epoch": 0.216512, + "grad_norm": 0.88671875, + "learning_rate": 7.898225806451614e-05, + "loss": 0.2013, + "step": 13532 + }, + { + "epoch": 0.216528, + "grad_norm": 1.015625, + "learning_rate": 7.898064516129032e-05, + "loss": 0.1718, + "step": 13533 + }, + { + "epoch": 0.216544, + "grad_norm": 0.90625, + "learning_rate": 7.897903225806452e-05, + "loss": 0.1961, + "step": 13534 + }, + { + "epoch": 0.21656, + "grad_norm": 0.8515625, + "learning_rate": 7.897741935483871e-05, + "loss": 0.1843, + "step": 13535 + }, + { + "epoch": 0.216576, + "grad_norm": 1.1640625, + "learning_rate": 7.897580645161291e-05, + "loss": 0.2277, + "step": 13536 + }, + { + "epoch": 0.216592, + "grad_norm": 0.8359375, + "learning_rate": 7.89741935483871e-05, + "loss": 0.2019, + "step": 13537 + }, + { + "epoch": 0.216608, + "grad_norm": 0.9296875, + "learning_rate": 7.89725806451613e-05, + "loss": 0.1554, + "step": 13538 + }, + { + "epoch": 0.216624, + "grad_norm": 0.77734375, + "learning_rate": 7.897096774193548e-05, + "loss": 0.1797, + "step": 13539 + }, + { + "epoch": 0.21664, + "grad_norm": 0.7421875, + "learning_rate": 7.896935483870968e-05, + "loss": 0.1684, + "step": 13540 + }, + { + "epoch": 0.216656, + "grad_norm": 0.6640625, + "learning_rate": 7.896774193548388e-05, + "loss": 0.1724, + "step": 13541 + }, + { + "epoch": 0.216672, + "grad_norm": 1.1796875, + "learning_rate": 7.896612903225808e-05, + "loss": 0.1674, + "step": 13542 + }, + { + "epoch": 0.216688, + "grad_norm": 0.69140625, + "learning_rate": 7.896451612903227e-05, + "loss": 0.1247, + "step": 13543 + }, + { + "epoch": 0.216704, + "grad_norm": 0.765625, + "learning_rate": 7.896290322580645e-05, + "loss": 0.1679, + "step": 13544 + }, + { + "epoch": 0.21672, + "grad_norm": 0.89453125, + "learning_rate": 7.896129032258065e-05, + "loss": 0.179, + "step": 13545 + }, + { + "epoch": 0.216736, + "grad_norm": 0.58203125, + "learning_rate": 7.895967741935484e-05, + "loss": 0.1168, + "step": 13546 + }, + { + "epoch": 0.216752, + "grad_norm": 1.1640625, + "learning_rate": 7.895806451612904e-05, + "loss": 0.2351, + "step": 13547 + }, + { + "epoch": 0.216768, + "grad_norm": 1.1484375, + "learning_rate": 7.895645161290322e-05, + "loss": 0.1572, + "step": 13548 + }, + { + "epoch": 0.216784, + "grad_norm": 0.671875, + "learning_rate": 7.895483870967742e-05, + "loss": 0.1215, + "step": 13549 + }, + { + "epoch": 0.2168, + "grad_norm": 1.0078125, + "learning_rate": 7.895322580645161e-05, + "loss": 0.1938, + "step": 13550 + }, + { + "epoch": 0.216816, + "grad_norm": 0.90234375, + "learning_rate": 7.895161290322581e-05, + "loss": 0.1867, + "step": 13551 + }, + { + "epoch": 0.216832, + "grad_norm": 0.53515625, + "learning_rate": 7.895000000000001e-05, + "loss": 0.1755, + "step": 13552 + }, + { + "epoch": 0.216848, + "grad_norm": 0.65234375, + "learning_rate": 7.894838709677421e-05, + "loss": 0.1559, + "step": 13553 + }, + { + "epoch": 0.216864, + "grad_norm": 0.75390625, + "learning_rate": 7.89467741935484e-05, + "loss": 0.1805, + "step": 13554 + }, + { + "epoch": 0.21688, + "grad_norm": 0.66015625, + "learning_rate": 7.89451612903226e-05, + "loss": 0.1759, + "step": 13555 + }, + { + "epoch": 0.216896, + "grad_norm": 0.78515625, + "learning_rate": 7.894354838709678e-05, + "loss": 0.1697, + "step": 13556 + }, + { + "epoch": 0.216912, + "grad_norm": 1.0, + "learning_rate": 7.894193548387098e-05, + "loss": 0.1951, + "step": 13557 + }, + { + "epoch": 0.216928, + "grad_norm": 1.2421875, + "learning_rate": 7.894032258064517e-05, + "loss": 0.1896, + "step": 13558 + }, + { + "epoch": 0.216944, + "grad_norm": 0.85546875, + "learning_rate": 7.893870967741935e-05, + "loss": 0.1513, + "step": 13559 + }, + { + "epoch": 0.21696, + "grad_norm": 1.15625, + "learning_rate": 7.893709677419355e-05, + "loss": 0.1723, + "step": 13560 + }, + { + "epoch": 0.216976, + "grad_norm": 0.95703125, + "learning_rate": 7.893548387096774e-05, + "loss": 0.1717, + "step": 13561 + }, + { + "epoch": 0.216992, + "grad_norm": 0.62890625, + "learning_rate": 7.893387096774194e-05, + "loss": 0.1535, + "step": 13562 + }, + { + "epoch": 0.217008, + "grad_norm": 0.82421875, + "learning_rate": 7.893225806451612e-05, + "loss": 0.1879, + "step": 13563 + }, + { + "epoch": 0.217024, + "grad_norm": 0.5390625, + "learning_rate": 7.893064516129032e-05, + "loss": 0.1545, + "step": 13564 + }, + { + "epoch": 0.21704, + "grad_norm": 0.671875, + "learning_rate": 7.892903225806452e-05, + "loss": 0.175, + "step": 13565 + }, + { + "epoch": 0.217056, + "grad_norm": 0.6640625, + "learning_rate": 7.892741935483872e-05, + "loss": 0.1357, + "step": 13566 + }, + { + "epoch": 0.217072, + "grad_norm": 0.5625, + "learning_rate": 7.892580645161291e-05, + "loss": 0.1626, + "step": 13567 + }, + { + "epoch": 0.217088, + "grad_norm": 0.69921875, + "learning_rate": 7.892419354838711e-05, + "loss": 0.1604, + "step": 13568 + }, + { + "epoch": 0.217104, + "grad_norm": 1.09375, + "learning_rate": 7.89225806451613e-05, + "loss": 0.1651, + "step": 13569 + }, + { + "epoch": 0.21712, + "grad_norm": 0.953125, + "learning_rate": 7.892096774193549e-05, + "loss": 0.1932, + "step": 13570 + }, + { + "epoch": 0.217136, + "grad_norm": 1.1171875, + "learning_rate": 7.891935483870968e-05, + "loss": 0.162, + "step": 13571 + }, + { + "epoch": 0.217152, + "grad_norm": 0.796875, + "learning_rate": 7.891774193548388e-05, + "loss": 0.2167, + "step": 13572 + }, + { + "epoch": 0.217168, + "grad_norm": 0.62890625, + "learning_rate": 7.891612903225806e-05, + "loss": 0.1605, + "step": 13573 + }, + { + "epoch": 0.217184, + "grad_norm": 0.65234375, + "learning_rate": 7.891451612903225e-05, + "loss": 0.1678, + "step": 13574 + }, + { + "epoch": 0.2172, + "grad_norm": 0.8203125, + "learning_rate": 7.891290322580645e-05, + "loss": 0.1588, + "step": 13575 + }, + { + "epoch": 0.217216, + "grad_norm": 0.56640625, + "learning_rate": 7.891129032258065e-05, + "loss": 0.1642, + "step": 13576 + }, + { + "epoch": 0.217232, + "grad_norm": 0.64453125, + "learning_rate": 7.890967741935485e-05, + "loss": 0.1551, + "step": 13577 + }, + { + "epoch": 0.217248, + "grad_norm": 0.7421875, + "learning_rate": 7.890806451612904e-05, + "loss": 0.179, + "step": 13578 + }, + { + "epoch": 0.217264, + "grad_norm": 0.87890625, + "learning_rate": 7.890645161290324e-05, + "loss": 0.149, + "step": 13579 + }, + { + "epoch": 0.21728, + "grad_norm": 1.6796875, + "learning_rate": 7.890483870967742e-05, + "loss": 0.1652, + "step": 13580 + }, + { + "epoch": 0.217296, + "grad_norm": 0.59765625, + "learning_rate": 7.890322580645162e-05, + "loss": 0.1605, + "step": 13581 + }, + { + "epoch": 0.217312, + "grad_norm": 1.3828125, + "learning_rate": 7.890161290322581e-05, + "loss": 0.1861, + "step": 13582 + }, + { + "epoch": 0.217328, + "grad_norm": 0.81640625, + "learning_rate": 7.890000000000001e-05, + "loss": 0.1375, + "step": 13583 + }, + { + "epoch": 0.217344, + "grad_norm": 1.15625, + "learning_rate": 7.889838709677419e-05, + "loss": 0.1714, + "step": 13584 + }, + { + "epoch": 0.21736, + "grad_norm": 1.015625, + "learning_rate": 7.889677419354839e-05, + "loss": 0.2262, + "step": 13585 + }, + { + "epoch": 0.217376, + "grad_norm": 0.60546875, + "learning_rate": 7.889516129032258e-05, + "loss": 0.1539, + "step": 13586 + }, + { + "epoch": 0.217392, + "grad_norm": 0.96875, + "learning_rate": 7.889354838709678e-05, + "loss": 0.1647, + "step": 13587 + }, + { + "epoch": 0.217408, + "grad_norm": 0.734375, + "learning_rate": 7.889193548387098e-05, + "loss": 0.181, + "step": 13588 + }, + { + "epoch": 0.217424, + "grad_norm": 1.046875, + "learning_rate": 7.889032258064516e-05, + "loss": 0.1807, + "step": 13589 + }, + { + "epoch": 0.21744, + "grad_norm": 1.0546875, + "learning_rate": 7.888870967741936e-05, + "loss": 0.1695, + "step": 13590 + }, + { + "epoch": 0.217456, + "grad_norm": 0.93359375, + "learning_rate": 7.888709677419355e-05, + "loss": 0.1814, + "step": 13591 + }, + { + "epoch": 0.217472, + "grad_norm": 0.90234375, + "learning_rate": 7.888548387096775e-05, + "loss": 0.1392, + "step": 13592 + }, + { + "epoch": 0.217488, + "grad_norm": 0.6796875, + "learning_rate": 7.888387096774194e-05, + "loss": 0.1754, + "step": 13593 + }, + { + "epoch": 0.217504, + "grad_norm": 0.75, + "learning_rate": 7.888225806451614e-05, + "loss": 0.1901, + "step": 13594 + }, + { + "epoch": 0.21752, + "grad_norm": 0.6171875, + "learning_rate": 7.888064516129032e-05, + "loss": 0.1388, + "step": 13595 + }, + { + "epoch": 0.217536, + "grad_norm": 0.9921875, + "learning_rate": 7.887903225806452e-05, + "loss": 0.1646, + "step": 13596 + }, + { + "epoch": 0.217552, + "grad_norm": 0.94140625, + "learning_rate": 7.887741935483871e-05, + "loss": 0.1521, + "step": 13597 + }, + { + "epoch": 0.217568, + "grad_norm": 0.91796875, + "learning_rate": 7.88758064516129e-05, + "loss": 0.1839, + "step": 13598 + }, + { + "epoch": 0.217584, + "grad_norm": 0.90625, + "learning_rate": 7.887419354838709e-05, + "loss": 0.1545, + "step": 13599 + }, + { + "epoch": 0.2176, + "grad_norm": 0.8984375, + "learning_rate": 7.887258064516129e-05, + "loss": 0.1756, + "step": 13600 + }, + { + "epoch": 0.217616, + "grad_norm": 0.87890625, + "learning_rate": 7.887096774193549e-05, + "loss": 0.1853, + "step": 13601 + }, + { + "epoch": 0.217632, + "grad_norm": 0.81640625, + "learning_rate": 7.886935483870969e-05, + "loss": 0.1809, + "step": 13602 + }, + { + "epoch": 0.217648, + "grad_norm": 0.6953125, + "learning_rate": 7.886774193548388e-05, + "loss": 0.17, + "step": 13603 + }, + { + "epoch": 0.217664, + "grad_norm": 0.94140625, + "learning_rate": 7.886612903225808e-05, + "loss": 0.1897, + "step": 13604 + }, + { + "epoch": 0.21768, + "grad_norm": 0.859375, + "learning_rate": 7.886451612903226e-05, + "loss": 0.1458, + "step": 13605 + }, + { + "epoch": 0.217696, + "grad_norm": 0.8046875, + "learning_rate": 7.886290322580645e-05, + "loss": 0.1591, + "step": 13606 + }, + { + "epoch": 0.217712, + "grad_norm": 0.8125, + "learning_rate": 7.886129032258065e-05, + "loss": 0.1699, + "step": 13607 + }, + { + "epoch": 0.217728, + "grad_norm": 0.64453125, + "learning_rate": 7.885967741935484e-05, + "loss": 0.1598, + "step": 13608 + }, + { + "epoch": 0.217744, + "grad_norm": 1.0078125, + "learning_rate": 7.885806451612903e-05, + "loss": 0.1784, + "step": 13609 + }, + { + "epoch": 0.21776, + "grad_norm": 0.7109375, + "learning_rate": 7.885645161290322e-05, + "loss": 0.1758, + "step": 13610 + }, + { + "epoch": 0.217776, + "grad_norm": 0.69140625, + "learning_rate": 7.885483870967742e-05, + "loss": 0.1822, + "step": 13611 + }, + { + "epoch": 0.217792, + "grad_norm": 0.68359375, + "learning_rate": 7.885322580645162e-05, + "loss": 0.1455, + "step": 13612 + }, + { + "epoch": 0.217808, + "grad_norm": 0.796875, + "learning_rate": 7.885161290322582e-05, + "loss": 0.1606, + "step": 13613 + }, + { + "epoch": 0.217824, + "grad_norm": 0.98828125, + "learning_rate": 7.885e-05, + "loss": 0.2216, + "step": 13614 + }, + { + "epoch": 0.21784, + "grad_norm": 0.86328125, + "learning_rate": 7.88483870967742e-05, + "loss": 0.1879, + "step": 13615 + }, + { + "epoch": 0.217856, + "grad_norm": 0.71875, + "learning_rate": 7.884677419354839e-05, + "loss": 0.1491, + "step": 13616 + }, + { + "epoch": 0.217872, + "grad_norm": 0.83203125, + "learning_rate": 7.884516129032259e-05, + "loss": 0.1813, + "step": 13617 + }, + { + "epoch": 0.217888, + "grad_norm": 0.98046875, + "learning_rate": 7.884354838709678e-05, + "loss": 0.1589, + "step": 13618 + }, + { + "epoch": 0.217904, + "grad_norm": 0.7578125, + "learning_rate": 7.884193548387098e-05, + "loss": 0.1628, + "step": 13619 + }, + { + "epoch": 0.21792, + "grad_norm": 0.7578125, + "learning_rate": 7.884032258064516e-05, + "loss": 0.1601, + "step": 13620 + }, + { + "epoch": 0.217936, + "grad_norm": 1.890625, + "learning_rate": 7.883870967741935e-05, + "loss": 0.2114, + "step": 13621 + }, + { + "epoch": 0.217952, + "grad_norm": 0.76953125, + "learning_rate": 7.883709677419355e-05, + "loss": 0.1705, + "step": 13622 + }, + { + "epoch": 0.217968, + "grad_norm": 1.109375, + "learning_rate": 7.883548387096773e-05, + "loss": 0.1907, + "step": 13623 + }, + { + "epoch": 0.217984, + "grad_norm": 0.88671875, + "learning_rate": 7.883387096774193e-05, + "loss": 0.2173, + "step": 13624 + }, + { + "epoch": 0.218, + "grad_norm": 1.578125, + "learning_rate": 7.883225806451613e-05, + "loss": 0.1704, + "step": 13625 + }, + { + "epoch": 0.218016, + "grad_norm": 1.3671875, + "learning_rate": 7.883064516129033e-05, + "loss": 0.188, + "step": 13626 + }, + { + "epoch": 0.218032, + "grad_norm": 1.1796875, + "learning_rate": 7.882903225806452e-05, + "loss": 0.1821, + "step": 13627 + }, + { + "epoch": 0.218048, + "grad_norm": 0.69140625, + "learning_rate": 7.882741935483872e-05, + "loss": 0.1562, + "step": 13628 + }, + { + "epoch": 0.218064, + "grad_norm": 0.81640625, + "learning_rate": 7.88258064516129e-05, + "loss": 0.1861, + "step": 13629 + }, + { + "epoch": 0.21808, + "grad_norm": 0.78125, + "learning_rate": 7.88241935483871e-05, + "loss": 0.1871, + "step": 13630 + }, + { + "epoch": 0.218096, + "grad_norm": 0.828125, + "learning_rate": 7.882258064516129e-05, + "loss": 0.1531, + "step": 13631 + }, + { + "epoch": 0.218112, + "grad_norm": 0.86328125, + "learning_rate": 7.882096774193549e-05, + "loss": 0.2013, + "step": 13632 + }, + { + "epoch": 0.218128, + "grad_norm": 0.75390625, + "learning_rate": 7.881935483870968e-05, + "loss": 0.1318, + "step": 13633 + }, + { + "epoch": 0.218144, + "grad_norm": 0.98828125, + "learning_rate": 7.881774193548388e-05, + "loss": 0.1966, + "step": 13634 + }, + { + "epoch": 0.21816, + "grad_norm": 0.796875, + "learning_rate": 7.881612903225806e-05, + "loss": 0.1568, + "step": 13635 + }, + { + "epoch": 0.218176, + "grad_norm": 0.609375, + "learning_rate": 7.881451612903226e-05, + "loss": 0.1505, + "step": 13636 + }, + { + "epoch": 0.218192, + "grad_norm": 0.6171875, + "learning_rate": 7.881290322580646e-05, + "loss": 0.1474, + "step": 13637 + }, + { + "epoch": 0.218208, + "grad_norm": 0.76171875, + "learning_rate": 7.881129032258065e-05, + "loss": 0.177, + "step": 13638 + }, + { + "epoch": 0.218224, + "grad_norm": 0.7421875, + "learning_rate": 7.880967741935485e-05, + "loss": 0.1812, + "step": 13639 + }, + { + "epoch": 0.21824, + "grad_norm": 1.1640625, + "learning_rate": 7.880806451612903e-05, + "loss": 0.1583, + "step": 13640 + }, + { + "epoch": 0.218256, + "grad_norm": 0.76171875, + "learning_rate": 7.880645161290323e-05, + "loss": 0.1442, + "step": 13641 + }, + { + "epoch": 0.218272, + "grad_norm": 0.671875, + "learning_rate": 7.880483870967742e-05, + "loss": 0.1398, + "step": 13642 + }, + { + "epoch": 0.218288, + "grad_norm": 0.6484375, + "learning_rate": 7.880322580645162e-05, + "loss": 0.1673, + "step": 13643 + }, + { + "epoch": 0.218304, + "grad_norm": 0.8359375, + "learning_rate": 7.88016129032258e-05, + "loss": 0.1656, + "step": 13644 + }, + { + "epoch": 0.21832, + "grad_norm": 0.62890625, + "learning_rate": 7.88e-05, + "loss": 0.1505, + "step": 13645 + }, + { + "epoch": 0.218336, + "grad_norm": 0.6796875, + "learning_rate": 7.879838709677419e-05, + "loss": 0.1831, + "step": 13646 + }, + { + "epoch": 0.218352, + "grad_norm": 0.69140625, + "learning_rate": 7.879677419354839e-05, + "loss": 0.1754, + "step": 13647 + }, + { + "epoch": 0.218368, + "grad_norm": 0.78515625, + "learning_rate": 7.879516129032259e-05, + "loss": 0.1691, + "step": 13648 + }, + { + "epoch": 0.218384, + "grad_norm": 0.72265625, + "learning_rate": 7.879354838709679e-05, + "loss": 0.1787, + "step": 13649 + }, + { + "epoch": 0.2184, + "grad_norm": 0.5546875, + "learning_rate": 7.879193548387098e-05, + "loss": 0.1456, + "step": 13650 + }, + { + "epoch": 0.218416, + "grad_norm": 0.640625, + "learning_rate": 7.879032258064518e-05, + "loss": 0.1496, + "step": 13651 + }, + { + "epoch": 0.218432, + "grad_norm": 0.5859375, + "learning_rate": 7.878870967741936e-05, + "loss": 0.1882, + "step": 13652 + }, + { + "epoch": 0.218448, + "grad_norm": 1.046875, + "learning_rate": 7.878709677419355e-05, + "loss": 0.155, + "step": 13653 + }, + { + "epoch": 0.218464, + "grad_norm": 0.66015625, + "learning_rate": 7.878548387096775e-05, + "loss": 0.1361, + "step": 13654 + }, + { + "epoch": 0.21848, + "grad_norm": 0.72265625, + "learning_rate": 7.878387096774193e-05, + "loss": 0.1748, + "step": 13655 + }, + { + "epoch": 0.218496, + "grad_norm": 1.03125, + "learning_rate": 7.878225806451613e-05, + "loss": 0.2071, + "step": 13656 + }, + { + "epoch": 0.218512, + "grad_norm": 0.6796875, + "learning_rate": 7.878064516129032e-05, + "loss": 0.1725, + "step": 13657 + }, + { + "epoch": 0.218528, + "grad_norm": 0.52734375, + "learning_rate": 7.877903225806452e-05, + "loss": 0.1486, + "step": 13658 + }, + { + "epoch": 0.218544, + "grad_norm": 1.15625, + "learning_rate": 7.87774193548387e-05, + "loss": 0.137, + "step": 13659 + }, + { + "epoch": 0.21856, + "grad_norm": 0.6015625, + "learning_rate": 7.87758064516129e-05, + "loss": 0.1633, + "step": 13660 + }, + { + "epoch": 0.218576, + "grad_norm": 0.81640625, + "learning_rate": 7.87741935483871e-05, + "loss": 0.1929, + "step": 13661 + }, + { + "epoch": 0.218592, + "grad_norm": 0.93359375, + "learning_rate": 7.87725806451613e-05, + "loss": 0.1519, + "step": 13662 + }, + { + "epoch": 0.218608, + "grad_norm": 0.69140625, + "learning_rate": 7.877096774193549e-05, + "loss": 0.1822, + "step": 13663 + }, + { + "epoch": 0.218624, + "grad_norm": 0.5859375, + "learning_rate": 7.876935483870969e-05, + "loss": 0.1384, + "step": 13664 + }, + { + "epoch": 0.21864, + "grad_norm": 0.84375, + "learning_rate": 7.876774193548388e-05, + "loss": 0.1932, + "step": 13665 + }, + { + "epoch": 0.218656, + "grad_norm": 0.95703125, + "learning_rate": 7.876612903225807e-05, + "loss": 0.1692, + "step": 13666 + }, + { + "epoch": 0.218672, + "grad_norm": 0.58984375, + "learning_rate": 7.876451612903226e-05, + "loss": 0.1401, + "step": 13667 + }, + { + "epoch": 0.218688, + "grad_norm": 0.62890625, + "learning_rate": 7.876290322580645e-05, + "loss": 0.1418, + "step": 13668 + }, + { + "epoch": 0.218704, + "grad_norm": 0.53125, + "learning_rate": 7.876129032258065e-05, + "loss": 0.1477, + "step": 13669 + }, + { + "epoch": 0.21872, + "grad_norm": 1.0390625, + "learning_rate": 7.875967741935483e-05, + "loss": 0.2022, + "step": 13670 + }, + { + "epoch": 0.218736, + "grad_norm": 0.8203125, + "learning_rate": 7.875806451612903e-05, + "loss": 0.1657, + "step": 13671 + }, + { + "epoch": 0.218752, + "grad_norm": 0.984375, + "learning_rate": 7.875645161290323e-05, + "loss": 0.1878, + "step": 13672 + }, + { + "epoch": 0.218768, + "grad_norm": 0.7578125, + "learning_rate": 7.875483870967743e-05, + "loss": 0.1733, + "step": 13673 + }, + { + "epoch": 0.218784, + "grad_norm": 0.99609375, + "learning_rate": 7.875322580645162e-05, + "loss": 0.1562, + "step": 13674 + }, + { + "epoch": 0.2188, + "grad_norm": 0.76953125, + "learning_rate": 7.875161290322582e-05, + "loss": 0.1802, + "step": 13675 + }, + { + "epoch": 0.218816, + "grad_norm": 0.55859375, + "learning_rate": 7.875e-05, + "loss": 0.1629, + "step": 13676 + }, + { + "epoch": 0.218832, + "grad_norm": 0.6875, + "learning_rate": 7.87483870967742e-05, + "loss": 0.1529, + "step": 13677 + }, + { + "epoch": 0.218848, + "grad_norm": 1.0546875, + "learning_rate": 7.874677419354839e-05, + "loss": 0.1944, + "step": 13678 + }, + { + "epoch": 0.218864, + "grad_norm": 0.69921875, + "learning_rate": 7.874516129032259e-05, + "loss": 0.1648, + "step": 13679 + }, + { + "epoch": 0.21888, + "grad_norm": 0.72265625, + "learning_rate": 7.874354838709677e-05, + "loss": 0.1773, + "step": 13680 + }, + { + "epoch": 0.218896, + "grad_norm": 0.69921875, + "learning_rate": 7.874193548387097e-05, + "loss": 0.1681, + "step": 13681 + }, + { + "epoch": 0.218912, + "grad_norm": 1.09375, + "learning_rate": 7.874032258064516e-05, + "loss": 0.1594, + "step": 13682 + }, + { + "epoch": 0.218928, + "grad_norm": 0.72265625, + "learning_rate": 7.873870967741936e-05, + "loss": 0.1859, + "step": 13683 + }, + { + "epoch": 0.218944, + "grad_norm": 0.62890625, + "learning_rate": 7.873709677419355e-05, + "loss": 0.1284, + "step": 13684 + }, + { + "epoch": 0.21896, + "grad_norm": 0.76171875, + "learning_rate": 7.873548387096775e-05, + "loss": 0.1894, + "step": 13685 + }, + { + "epoch": 0.218976, + "grad_norm": 0.6796875, + "learning_rate": 7.873387096774195e-05, + "loss": 0.1902, + "step": 13686 + }, + { + "epoch": 0.218992, + "grad_norm": 0.890625, + "learning_rate": 7.873225806451613e-05, + "loss": 0.2145, + "step": 13687 + }, + { + "epoch": 0.219008, + "grad_norm": 0.8046875, + "learning_rate": 7.873064516129033e-05, + "loss": 0.1827, + "step": 13688 + }, + { + "epoch": 0.219024, + "grad_norm": 0.78125, + "learning_rate": 7.872903225806452e-05, + "loss": 0.162, + "step": 13689 + }, + { + "epoch": 0.21904, + "grad_norm": 0.84765625, + "learning_rate": 7.872741935483872e-05, + "loss": 0.1953, + "step": 13690 + }, + { + "epoch": 0.219056, + "grad_norm": 0.64453125, + "learning_rate": 7.87258064516129e-05, + "loss": 0.1861, + "step": 13691 + }, + { + "epoch": 0.219072, + "grad_norm": 0.62109375, + "learning_rate": 7.87241935483871e-05, + "loss": 0.1354, + "step": 13692 + }, + { + "epoch": 0.219088, + "grad_norm": 0.6875, + "learning_rate": 7.872258064516129e-05, + "loss": 0.19, + "step": 13693 + }, + { + "epoch": 0.219104, + "grad_norm": 0.9140625, + "learning_rate": 7.872096774193549e-05, + "loss": 0.1525, + "step": 13694 + }, + { + "epoch": 0.21912, + "grad_norm": 0.482421875, + "learning_rate": 7.871935483870967e-05, + "loss": 0.1542, + "step": 13695 + }, + { + "epoch": 0.219136, + "grad_norm": 0.8359375, + "learning_rate": 7.871774193548387e-05, + "loss": 0.1823, + "step": 13696 + }, + { + "epoch": 0.219152, + "grad_norm": 0.90625, + "learning_rate": 7.871612903225807e-05, + "loss": 0.1559, + "step": 13697 + }, + { + "epoch": 0.219168, + "grad_norm": 1.1953125, + "learning_rate": 7.871451612903226e-05, + "loss": 0.199, + "step": 13698 + }, + { + "epoch": 0.219184, + "grad_norm": 1.0546875, + "learning_rate": 7.871290322580646e-05, + "loss": 0.2052, + "step": 13699 + }, + { + "epoch": 0.2192, + "grad_norm": 0.91796875, + "learning_rate": 7.871129032258065e-05, + "loss": 0.1784, + "step": 13700 + }, + { + "epoch": 0.219216, + "grad_norm": 0.80859375, + "learning_rate": 7.870967741935484e-05, + "loss": 0.1847, + "step": 13701 + }, + { + "epoch": 0.219232, + "grad_norm": 0.984375, + "learning_rate": 7.870806451612903e-05, + "loss": 0.1671, + "step": 13702 + }, + { + "epoch": 0.219248, + "grad_norm": 0.80859375, + "learning_rate": 7.870645161290323e-05, + "loss": 0.2054, + "step": 13703 + }, + { + "epoch": 0.219264, + "grad_norm": 1.125, + "learning_rate": 7.870483870967742e-05, + "loss": 0.1544, + "step": 13704 + }, + { + "epoch": 0.21928, + "grad_norm": 1.09375, + "learning_rate": 7.870322580645162e-05, + "loss": 0.1804, + "step": 13705 + }, + { + "epoch": 0.219296, + "grad_norm": 0.80859375, + "learning_rate": 7.87016129032258e-05, + "loss": 0.153, + "step": 13706 + }, + { + "epoch": 0.219312, + "grad_norm": 0.7265625, + "learning_rate": 7.87e-05, + "loss": 0.1576, + "step": 13707 + }, + { + "epoch": 0.219328, + "grad_norm": 0.85546875, + "learning_rate": 7.86983870967742e-05, + "loss": 0.1374, + "step": 13708 + }, + { + "epoch": 0.219344, + "grad_norm": 0.91015625, + "learning_rate": 7.86967741935484e-05, + "loss": 0.1546, + "step": 13709 + }, + { + "epoch": 0.21936, + "grad_norm": 0.97265625, + "learning_rate": 7.869516129032259e-05, + "loss": 0.1757, + "step": 13710 + }, + { + "epoch": 0.219376, + "grad_norm": 0.71484375, + "learning_rate": 7.869354838709679e-05, + "loss": 0.1554, + "step": 13711 + }, + { + "epoch": 0.219392, + "grad_norm": 1.09375, + "learning_rate": 7.869193548387097e-05, + "loss": 0.1901, + "step": 13712 + }, + { + "epoch": 0.219408, + "grad_norm": 0.734375, + "learning_rate": 7.869032258064517e-05, + "loss": 0.2091, + "step": 13713 + }, + { + "epoch": 0.219424, + "grad_norm": 1.125, + "learning_rate": 7.868870967741936e-05, + "loss": 0.1749, + "step": 13714 + }, + { + "epoch": 0.21944, + "grad_norm": 0.8515625, + "learning_rate": 7.868709677419354e-05, + "loss": 0.1783, + "step": 13715 + }, + { + "epoch": 0.219456, + "grad_norm": 0.66015625, + "learning_rate": 7.868548387096774e-05, + "loss": 0.1566, + "step": 13716 + }, + { + "epoch": 0.219472, + "grad_norm": 0.67578125, + "learning_rate": 7.868387096774193e-05, + "loss": 0.1313, + "step": 13717 + }, + { + "epoch": 0.219488, + "grad_norm": 1.125, + "learning_rate": 7.868225806451613e-05, + "loss": 0.1463, + "step": 13718 + }, + { + "epoch": 0.219504, + "grad_norm": 1.453125, + "learning_rate": 7.868064516129032e-05, + "loss": 0.2027, + "step": 13719 + }, + { + "epoch": 0.21952, + "grad_norm": 1.2421875, + "learning_rate": 7.867903225806452e-05, + "loss": 0.2334, + "step": 13720 + }, + { + "epoch": 0.219536, + "grad_norm": 2.046875, + "learning_rate": 7.867741935483872e-05, + "loss": 0.1972, + "step": 13721 + }, + { + "epoch": 0.219552, + "grad_norm": 0.94140625, + "learning_rate": 7.867580645161292e-05, + "loss": 0.1622, + "step": 13722 + }, + { + "epoch": 0.219568, + "grad_norm": 1.8984375, + "learning_rate": 7.86741935483871e-05, + "loss": 0.1684, + "step": 13723 + }, + { + "epoch": 0.219584, + "grad_norm": 1.453125, + "learning_rate": 7.86725806451613e-05, + "loss": 0.2306, + "step": 13724 + }, + { + "epoch": 0.2196, + "grad_norm": 0.77734375, + "learning_rate": 7.867096774193549e-05, + "loss": 0.2031, + "step": 13725 + }, + { + "epoch": 0.219616, + "grad_norm": 0.640625, + "learning_rate": 7.866935483870969e-05, + "loss": 0.1663, + "step": 13726 + }, + { + "epoch": 0.219632, + "grad_norm": 0.94140625, + "learning_rate": 7.866774193548387e-05, + "loss": 0.1856, + "step": 13727 + }, + { + "epoch": 0.219648, + "grad_norm": 0.640625, + "learning_rate": 7.866612903225807e-05, + "loss": 0.1432, + "step": 13728 + }, + { + "epoch": 0.219664, + "grad_norm": 0.9296875, + "learning_rate": 7.866451612903226e-05, + "loss": 0.1754, + "step": 13729 + }, + { + "epoch": 0.21968, + "grad_norm": 1.2890625, + "learning_rate": 7.866290322580644e-05, + "loss": 0.1832, + "step": 13730 + }, + { + "epoch": 0.219696, + "grad_norm": 0.93359375, + "learning_rate": 7.866129032258064e-05, + "loss": 0.1429, + "step": 13731 + }, + { + "epoch": 0.219712, + "grad_norm": 1.078125, + "learning_rate": 7.865967741935484e-05, + "loss": 0.1901, + "step": 13732 + }, + { + "epoch": 0.219728, + "grad_norm": 0.69921875, + "learning_rate": 7.865806451612904e-05, + "loss": 0.1636, + "step": 13733 + }, + { + "epoch": 0.219744, + "grad_norm": 0.75390625, + "learning_rate": 7.865645161290323e-05, + "loss": 0.2204, + "step": 13734 + }, + { + "epoch": 0.21976, + "grad_norm": 0.91015625, + "learning_rate": 7.865483870967743e-05, + "loss": 0.1507, + "step": 13735 + }, + { + "epoch": 0.219776, + "grad_norm": 0.68359375, + "learning_rate": 7.865322580645162e-05, + "loss": 0.172, + "step": 13736 + }, + { + "epoch": 0.219792, + "grad_norm": 0.55859375, + "learning_rate": 7.865161290322581e-05, + "loss": 0.1718, + "step": 13737 + }, + { + "epoch": 0.219808, + "grad_norm": 0.5703125, + "learning_rate": 7.865e-05, + "loss": 0.1476, + "step": 13738 + }, + { + "epoch": 0.219824, + "grad_norm": 0.765625, + "learning_rate": 7.86483870967742e-05, + "loss": 0.1695, + "step": 13739 + }, + { + "epoch": 0.21984, + "grad_norm": 0.83984375, + "learning_rate": 7.864677419354839e-05, + "loss": 0.1952, + "step": 13740 + }, + { + "epoch": 0.219856, + "grad_norm": 0.84765625, + "learning_rate": 7.864516129032259e-05, + "loss": 0.1813, + "step": 13741 + }, + { + "epoch": 0.219872, + "grad_norm": 0.84375, + "learning_rate": 7.864354838709677e-05, + "loss": 0.2103, + "step": 13742 + }, + { + "epoch": 0.219888, + "grad_norm": 0.65625, + "learning_rate": 7.864193548387097e-05, + "loss": 0.1961, + "step": 13743 + }, + { + "epoch": 0.219904, + "grad_norm": 0.84765625, + "learning_rate": 7.864032258064517e-05, + "loss": 0.1574, + "step": 13744 + }, + { + "epoch": 0.21992, + "grad_norm": 0.6171875, + "learning_rate": 7.863870967741936e-05, + "loss": 0.1626, + "step": 13745 + }, + { + "epoch": 0.219936, + "grad_norm": 0.88671875, + "learning_rate": 7.863709677419356e-05, + "loss": 0.2175, + "step": 13746 + }, + { + "epoch": 0.219952, + "grad_norm": 0.52734375, + "learning_rate": 7.863548387096774e-05, + "loss": 0.1573, + "step": 13747 + }, + { + "epoch": 0.219968, + "grad_norm": 0.68359375, + "learning_rate": 7.863387096774194e-05, + "loss": 0.1538, + "step": 13748 + }, + { + "epoch": 0.219984, + "grad_norm": 1.4375, + "learning_rate": 7.863225806451613e-05, + "loss": 0.1681, + "step": 13749 + }, + { + "epoch": 0.22, + "grad_norm": 1.0078125, + "learning_rate": 7.863064516129033e-05, + "loss": 0.1682, + "step": 13750 + }, + { + "epoch": 0.220016, + "grad_norm": 0.66015625, + "learning_rate": 7.862903225806451e-05, + "loss": 0.1507, + "step": 13751 + }, + { + "epoch": 0.220032, + "grad_norm": 0.63671875, + "learning_rate": 7.862741935483871e-05, + "loss": 0.1529, + "step": 13752 + }, + { + "epoch": 0.220048, + "grad_norm": 0.6484375, + "learning_rate": 7.86258064516129e-05, + "loss": 0.1445, + "step": 13753 + }, + { + "epoch": 0.220064, + "grad_norm": 0.73828125, + "learning_rate": 7.86241935483871e-05, + "loss": 0.1752, + "step": 13754 + }, + { + "epoch": 0.22008, + "grad_norm": 1.2578125, + "learning_rate": 7.862258064516129e-05, + "loss": 0.1899, + "step": 13755 + }, + { + "epoch": 0.220096, + "grad_norm": 0.85546875, + "learning_rate": 7.862096774193549e-05, + "loss": 0.1961, + "step": 13756 + }, + { + "epoch": 0.220112, + "grad_norm": 0.9375, + "learning_rate": 7.861935483870969e-05, + "loss": 0.1884, + "step": 13757 + }, + { + "epoch": 0.220128, + "grad_norm": 0.7578125, + "learning_rate": 7.861774193548389e-05, + "loss": 0.1504, + "step": 13758 + }, + { + "epoch": 0.220144, + "grad_norm": 1.1171875, + "learning_rate": 7.861612903225807e-05, + "loss": 0.1578, + "step": 13759 + }, + { + "epoch": 0.22016, + "grad_norm": 1.171875, + "learning_rate": 7.861451612903227e-05, + "loss": 0.1716, + "step": 13760 + }, + { + "epoch": 0.220176, + "grad_norm": 0.78125, + "learning_rate": 7.861290322580646e-05, + "loss": 0.1706, + "step": 13761 + }, + { + "epoch": 0.220192, + "grad_norm": 0.55859375, + "learning_rate": 7.861129032258064e-05, + "loss": 0.1658, + "step": 13762 + }, + { + "epoch": 0.220208, + "grad_norm": 0.7734375, + "learning_rate": 7.860967741935484e-05, + "loss": 0.1582, + "step": 13763 + }, + { + "epoch": 0.220224, + "grad_norm": 0.85546875, + "learning_rate": 7.860806451612903e-05, + "loss": 0.1226, + "step": 13764 + }, + { + "epoch": 0.22024, + "grad_norm": 0.8203125, + "learning_rate": 7.860645161290323e-05, + "loss": 0.1509, + "step": 13765 + }, + { + "epoch": 0.220256, + "grad_norm": 0.765625, + "learning_rate": 7.860483870967741e-05, + "loss": 0.1637, + "step": 13766 + }, + { + "epoch": 0.220272, + "grad_norm": 0.8125, + "learning_rate": 7.860322580645161e-05, + "loss": 0.1737, + "step": 13767 + }, + { + "epoch": 0.220288, + "grad_norm": 0.9453125, + "learning_rate": 7.860161290322581e-05, + "loss": 0.1982, + "step": 13768 + }, + { + "epoch": 0.220304, + "grad_norm": 1.0, + "learning_rate": 7.860000000000001e-05, + "loss": 0.2036, + "step": 13769 + }, + { + "epoch": 0.22032, + "grad_norm": 1.3671875, + "learning_rate": 7.85983870967742e-05, + "loss": 0.1809, + "step": 13770 + }, + { + "epoch": 0.220336, + "grad_norm": 0.76953125, + "learning_rate": 7.85967741935484e-05, + "loss": 0.1889, + "step": 13771 + }, + { + "epoch": 0.220352, + "grad_norm": 0.828125, + "learning_rate": 7.859516129032258e-05, + "loss": 0.14, + "step": 13772 + }, + { + "epoch": 0.220368, + "grad_norm": 0.703125, + "learning_rate": 7.859354838709678e-05, + "loss": 0.1416, + "step": 13773 + }, + { + "epoch": 0.220384, + "grad_norm": 0.68359375, + "learning_rate": 7.859193548387097e-05, + "loss": 0.1498, + "step": 13774 + }, + { + "epoch": 0.2204, + "grad_norm": 1.078125, + "learning_rate": 7.859032258064517e-05, + "loss": 0.1659, + "step": 13775 + }, + { + "epoch": 0.220416, + "grad_norm": 1.2421875, + "learning_rate": 7.858870967741936e-05, + "loss": 0.2002, + "step": 13776 + }, + { + "epoch": 0.220432, + "grad_norm": 0.81640625, + "learning_rate": 7.858709677419354e-05, + "loss": 0.1873, + "step": 13777 + }, + { + "epoch": 0.220448, + "grad_norm": 1.09375, + "learning_rate": 7.858548387096774e-05, + "loss": 0.1545, + "step": 13778 + }, + { + "epoch": 0.220464, + "grad_norm": 0.765625, + "learning_rate": 7.858387096774193e-05, + "loss": 0.1713, + "step": 13779 + }, + { + "epoch": 0.22048, + "grad_norm": 0.69921875, + "learning_rate": 7.858225806451613e-05, + "loss": 0.1364, + "step": 13780 + }, + { + "epoch": 0.220496, + "grad_norm": 0.58984375, + "learning_rate": 7.858064516129033e-05, + "loss": 0.13, + "step": 13781 + }, + { + "epoch": 0.220512, + "grad_norm": 0.78515625, + "learning_rate": 7.857903225806453e-05, + "loss": 0.1524, + "step": 13782 + }, + { + "epoch": 0.220528, + "grad_norm": 0.6328125, + "learning_rate": 7.857741935483871e-05, + "loss": 0.1644, + "step": 13783 + }, + { + "epoch": 0.220544, + "grad_norm": 0.58984375, + "learning_rate": 7.857580645161291e-05, + "loss": 0.1497, + "step": 13784 + }, + { + "epoch": 0.22056, + "grad_norm": 0.6171875, + "learning_rate": 7.85741935483871e-05, + "loss": 0.1542, + "step": 13785 + }, + { + "epoch": 0.220576, + "grad_norm": 0.76953125, + "learning_rate": 7.85725806451613e-05, + "loss": 0.1601, + "step": 13786 + }, + { + "epoch": 0.220592, + "grad_norm": 1.421875, + "learning_rate": 7.857096774193548e-05, + "loss": 0.1304, + "step": 13787 + }, + { + "epoch": 0.220608, + "grad_norm": 0.70703125, + "learning_rate": 7.856935483870968e-05, + "loss": 0.1466, + "step": 13788 + }, + { + "epoch": 0.220624, + "grad_norm": 0.70703125, + "learning_rate": 7.856774193548387e-05, + "loss": 0.1917, + "step": 13789 + }, + { + "epoch": 0.22064, + "grad_norm": 0.55078125, + "learning_rate": 7.856612903225807e-05, + "loss": 0.144, + "step": 13790 + }, + { + "epoch": 0.220656, + "grad_norm": 0.875, + "learning_rate": 7.856451612903226e-05, + "loss": 0.173, + "step": 13791 + }, + { + "epoch": 0.220672, + "grad_norm": 0.78515625, + "learning_rate": 7.856290322580646e-05, + "loss": 0.2007, + "step": 13792 + }, + { + "epoch": 0.220688, + "grad_norm": 0.6953125, + "learning_rate": 7.856129032258066e-05, + "loss": 0.1532, + "step": 13793 + }, + { + "epoch": 0.220704, + "grad_norm": 0.7890625, + "learning_rate": 7.855967741935484e-05, + "loss": 0.2043, + "step": 13794 + }, + { + "epoch": 0.22072, + "grad_norm": 0.77734375, + "learning_rate": 7.855806451612904e-05, + "loss": 0.1998, + "step": 13795 + }, + { + "epoch": 0.220736, + "grad_norm": 0.427734375, + "learning_rate": 7.855645161290323e-05, + "loss": 0.1315, + "step": 13796 + }, + { + "epoch": 0.220752, + "grad_norm": 0.89453125, + "learning_rate": 7.855483870967743e-05, + "loss": 0.1658, + "step": 13797 + }, + { + "epoch": 0.220768, + "grad_norm": 0.79296875, + "learning_rate": 7.855322580645161e-05, + "loss": 0.1675, + "step": 13798 + }, + { + "epoch": 0.220784, + "grad_norm": 0.91796875, + "learning_rate": 7.855161290322581e-05, + "loss": 0.1634, + "step": 13799 + }, + { + "epoch": 0.2208, + "grad_norm": 0.75390625, + "learning_rate": 7.855e-05, + "loss": 0.1416, + "step": 13800 + }, + { + "epoch": 0.220816, + "grad_norm": 0.74609375, + "learning_rate": 7.85483870967742e-05, + "loss": 0.1501, + "step": 13801 + }, + { + "epoch": 0.220832, + "grad_norm": 0.85546875, + "learning_rate": 7.854677419354838e-05, + "loss": 0.1489, + "step": 13802 + }, + { + "epoch": 0.220848, + "grad_norm": 0.515625, + "learning_rate": 7.854516129032258e-05, + "loss": 0.1358, + "step": 13803 + }, + { + "epoch": 0.220864, + "grad_norm": 0.5859375, + "learning_rate": 7.854354838709678e-05, + "loss": 0.1448, + "step": 13804 + }, + { + "epoch": 0.22088, + "grad_norm": 1.09375, + "learning_rate": 7.854193548387098e-05, + "loss": 0.1943, + "step": 13805 + }, + { + "epoch": 0.220896, + "grad_norm": 0.80859375, + "learning_rate": 7.854032258064517e-05, + "loss": 0.1665, + "step": 13806 + }, + { + "epoch": 0.220912, + "grad_norm": 0.59765625, + "learning_rate": 7.853870967741936e-05, + "loss": 0.141, + "step": 13807 + }, + { + "epoch": 0.220928, + "grad_norm": 0.8671875, + "learning_rate": 7.853709677419355e-05, + "loss": 0.1886, + "step": 13808 + }, + { + "epoch": 0.220944, + "grad_norm": 0.7578125, + "learning_rate": 7.853548387096774e-05, + "loss": 0.1934, + "step": 13809 + }, + { + "epoch": 0.22096, + "grad_norm": 1.1328125, + "learning_rate": 7.853387096774194e-05, + "loss": 0.1589, + "step": 13810 + }, + { + "epoch": 0.220976, + "grad_norm": 0.8046875, + "learning_rate": 7.853225806451613e-05, + "loss": 0.1745, + "step": 13811 + }, + { + "epoch": 0.220992, + "grad_norm": 0.7265625, + "learning_rate": 7.853064516129033e-05, + "loss": 0.1616, + "step": 13812 + }, + { + "epoch": 0.221008, + "grad_norm": 1.09375, + "learning_rate": 7.852903225806451e-05, + "loss": 0.1987, + "step": 13813 + }, + { + "epoch": 0.221024, + "grad_norm": 0.68359375, + "learning_rate": 7.852741935483871e-05, + "loss": 0.145, + "step": 13814 + }, + { + "epoch": 0.22104, + "grad_norm": 0.72265625, + "learning_rate": 7.85258064516129e-05, + "loss": 0.182, + "step": 13815 + }, + { + "epoch": 0.221056, + "grad_norm": 0.95703125, + "learning_rate": 7.85241935483871e-05, + "loss": 0.1285, + "step": 13816 + }, + { + "epoch": 0.221072, + "grad_norm": 1.0703125, + "learning_rate": 7.85225806451613e-05, + "loss": 0.2031, + "step": 13817 + }, + { + "epoch": 0.221088, + "grad_norm": 0.63671875, + "learning_rate": 7.85209677419355e-05, + "loss": 0.1414, + "step": 13818 + }, + { + "epoch": 0.221104, + "grad_norm": 0.8046875, + "learning_rate": 7.851935483870968e-05, + "loss": 0.2027, + "step": 13819 + }, + { + "epoch": 0.22112, + "grad_norm": 0.90625, + "learning_rate": 7.851774193548388e-05, + "loss": 0.1866, + "step": 13820 + }, + { + "epoch": 0.221136, + "grad_norm": 0.98828125, + "learning_rate": 7.851612903225807e-05, + "loss": 0.1331, + "step": 13821 + }, + { + "epoch": 0.221152, + "grad_norm": 0.73828125, + "learning_rate": 7.851451612903227e-05, + "loss": 0.1885, + "step": 13822 + }, + { + "epoch": 0.221168, + "grad_norm": 1.046875, + "learning_rate": 7.851290322580645e-05, + "loss": 0.1815, + "step": 13823 + }, + { + "epoch": 0.221184, + "grad_norm": 0.671875, + "learning_rate": 7.851129032258064e-05, + "loss": 0.1629, + "step": 13824 + }, + { + "epoch": 0.2212, + "grad_norm": 0.71875, + "learning_rate": 7.850967741935484e-05, + "loss": 0.2136, + "step": 13825 + }, + { + "epoch": 0.221216, + "grad_norm": 0.7890625, + "learning_rate": 7.850806451612903e-05, + "loss": 0.1854, + "step": 13826 + }, + { + "epoch": 0.221232, + "grad_norm": 0.76953125, + "learning_rate": 7.850645161290323e-05, + "loss": 0.1868, + "step": 13827 + }, + { + "epoch": 0.221248, + "grad_norm": 0.5703125, + "learning_rate": 7.850483870967743e-05, + "loss": 0.1582, + "step": 13828 + }, + { + "epoch": 0.221264, + "grad_norm": 0.81640625, + "learning_rate": 7.850322580645163e-05, + "loss": 0.1747, + "step": 13829 + }, + { + "epoch": 0.22128, + "grad_norm": 0.9140625, + "learning_rate": 7.850161290322581e-05, + "loss": 0.1885, + "step": 13830 + }, + { + "epoch": 0.221296, + "grad_norm": 0.93359375, + "learning_rate": 7.850000000000001e-05, + "loss": 0.1308, + "step": 13831 + }, + { + "epoch": 0.221312, + "grad_norm": 1.171875, + "learning_rate": 7.84983870967742e-05, + "loss": 0.2112, + "step": 13832 + }, + { + "epoch": 0.221328, + "grad_norm": 0.79296875, + "learning_rate": 7.84967741935484e-05, + "loss": 0.2, + "step": 13833 + }, + { + "epoch": 0.221344, + "grad_norm": 1.171875, + "learning_rate": 7.849516129032258e-05, + "loss": 0.1804, + "step": 13834 + }, + { + "epoch": 0.22136, + "grad_norm": 0.94140625, + "learning_rate": 7.849354838709678e-05, + "loss": 0.2094, + "step": 13835 + }, + { + "epoch": 0.221376, + "grad_norm": 0.478515625, + "learning_rate": 7.849193548387097e-05, + "loss": 0.1471, + "step": 13836 + }, + { + "epoch": 0.221392, + "grad_norm": 0.828125, + "learning_rate": 7.849032258064517e-05, + "loss": 0.1596, + "step": 13837 + }, + { + "epoch": 0.221408, + "grad_norm": 0.84375, + "learning_rate": 7.848870967741935e-05, + "loss": 0.1729, + "step": 13838 + }, + { + "epoch": 0.221424, + "grad_norm": 1.0546875, + "learning_rate": 7.848709677419355e-05, + "loss": 0.2077, + "step": 13839 + }, + { + "epoch": 0.22144, + "grad_norm": 0.640625, + "learning_rate": 7.848548387096775e-05, + "loss": 0.1662, + "step": 13840 + }, + { + "epoch": 0.221456, + "grad_norm": 0.75, + "learning_rate": 7.848387096774194e-05, + "loss": 0.1915, + "step": 13841 + }, + { + "epoch": 0.221472, + "grad_norm": 1.0234375, + "learning_rate": 7.848225806451614e-05, + "loss": 0.1341, + "step": 13842 + }, + { + "epoch": 0.221488, + "grad_norm": 0.82421875, + "learning_rate": 7.848064516129032e-05, + "loss": 0.1901, + "step": 13843 + }, + { + "epoch": 0.221504, + "grad_norm": 0.671875, + "learning_rate": 7.847903225806452e-05, + "loss": 0.161, + "step": 13844 + }, + { + "epoch": 0.22152, + "grad_norm": 0.94140625, + "learning_rate": 7.847741935483871e-05, + "loss": 0.1723, + "step": 13845 + }, + { + "epoch": 0.221536, + "grad_norm": 0.62890625, + "learning_rate": 7.847580645161291e-05, + "loss": 0.1431, + "step": 13846 + }, + { + "epoch": 0.221552, + "grad_norm": 0.69921875, + "learning_rate": 7.84741935483871e-05, + "loss": 0.1808, + "step": 13847 + }, + { + "epoch": 0.221568, + "grad_norm": 0.8359375, + "learning_rate": 7.84725806451613e-05, + "loss": 0.1762, + "step": 13848 + }, + { + "epoch": 0.221584, + "grad_norm": 1.4296875, + "learning_rate": 7.847096774193548e-05, + "loss": 0.1545, + "step": 13849 + }, + { + "epoch": 0.2216, + "grad_norm": 0.8359375, + "learning_rate": 7.846935483870968e-05, + "loss": 0.171, + "step": 13850 + }, + { + "epoch": 0.221616, + "grad_norm": 0.6328125, + "learning_rate": 7.846774193548387e-05, + "loss": 0.1597, + "step": 13851 + }, + { + "epoch": 0.221632, + "grad_norm": 0.97265625, + "learning_rate": 7.846612903225807e-05, + "loss": 0.174, + "step": 13852 + }, + { + "epoch": 0.221648, + "grad_norm": 0.74609375, + "learning_rate": 7.846451612903227e-05, + "loss": 0.1873, + "step": 13853 + }, + { + "epoch": 0.221664, + "grad_norm": 0.73046875, + "learning_rate": 7.846290322580645e-05, + "loss": 0.1576, + "step": 13854 + }, + { + "epoch": 0.22168, + "grad_norm": 0.75390625, + "learning_rate": 7.846129032258065e-05, + "loss": 0.1841, + "step": 13855 + }, + { + "epoch": 0.221696, + "grad_norm": 0.71875, + "learning_rate": 7.845967741935484e-05, + "loss": 0.1929, + "step": 13856 + }, + { + "epoch": 0.221712, + "grad_norm": 1.0625, + "learning_rate": 7.845806451612904e-05, + "loss": 0.1198, + "step": 13857 + }, + { + "epoch": 0.221728, + "grad_norm": 0.60546875, + "learning_rate": 7.845645161290322e-05, + "loss": 0.1765, + "step": 13858 + }, + { + "epoch": 0.221744, + "grad_norm": 0.64453125, + "learning_rate": 7.845483870967742e-05, + "loss": 0.172, + "step": 13859 + }, + { + "epoch": 0.22176, + "grad_norm": 0.7734375, + "learning_rate": 7.845322580645161e-05, + "loss": 0.1603, + "step": 13860 + }, + { + "epoch": 0.221776, + "grad_norm": 1.734375, + "learning_rate": 7.845161290322581e-05, + "loss": 0.1336, + "step": 13861 + }, + { + "epoch": 0.221792, + "grad_norm": 0.9140625, + "learning_rate": 7.845e-05, + "loss": 0.198, + "step": 13862 + }, + { + "epoch": 0.221808, + "grad_norm": 0.73046875, + "learning_rate": 7.84483870967742e-05, + "loss": 0.1794, + "step": 13863 + }, + { + "epoch": 0.221824, + "grad_norm": 1.0859375, + "learning_rate": 7.84467741935484e-05, + "loss": 0.1504, + "step": 13864 + }, + { + "epoch": 0.22184, + "grad_norm": 1.1640625, + "learning_rate": 7.84451612903226e-05, + "loss": 0.1582, + "step": 13865 + }, + { + "epoch": 0.221856, + "grad_norm": 0.91796875, + "learning_rate": 7.844354838709678e-05, + "loss": 0.194, + "step": 13866 + }, + { + "epoch": 0.221872, + "grad_norm": 0.74609375, + "learning_rate": 7.844193548387098e-05, + "loss": 0.1748, + "step": 13867 + }, + { + "epoch": 0.221888, + "grad_norm": 0.6484375, + "learning_rate": 7.844032258064517e-05, + "loss": 0.1711, + "step": 13868 + }, + { + "epoch": 0.221904, + "grad_norm": 0.59375, + "learning_rate": 7.843870967741935e-05, + "loss": 0.1193, + "step": 13869 + }, + { + "epoch": 0.22192, + "grad_norm": 0.7890625, + "learning_rate": 7.843709677419355e-05, + "loss": 0.2255, + "step": 13870 + }, + { + "epoch": 0.221936, + "grad_norm": 0.875, + "learning_rate": 7.843548387096774e-05, + "loss": 0.1691, + "step": 13871 + }, + { + "epoch": 0.221952, + "grad_norm": 0.6796875, + "learning_rate": 7.843387096774194e-05, + "loss": 0.1816, + "step": 13872 + }, + { + "epoch": 0.221968, + "grad_norm": 0.63671875, + "learning_rate": 7.843225806451612e-05, + "loss": 0.1358, + "step": 13873 + }, + { + "epoch": 0.221984, + "grad_norm": 0.95703125, + "learning_rate": 7.843064516129032e-05, + "loss": 0.1748, + "step": 13874 + }, + { + "epoch": 0.222, + "grad_norm": 0.69140625, + "learning_rate": 7.842903225806451e-05, + "loss": 0.1876, + "step": 13875 + }, + { + "epoch": 0.222016, + "grad_norm": 0.6171875, + "learning_rate": 7.842741935483871e-05, + "loss": 0.15, + "step": 13876 + }, + { + "epoch": 0.222032, + "grad_norm": 0.921875, + "learning_rate": 7.842580645161291e-05, + "loss": 0.201, + "step": 13877 + }, + { + "epoch": 0.222048, + "grad_norm": 0.9296875, + "learning_rate": 7.842419354838711e-05, + "loss": 0.1734, + "step": 13878 + }, + { + "epoch": 0.222064, + "grad_norm": 0.68359375, + "learning_rate": 7.84225806451613e-05, + "loss": 0.1555, + "step": 13879 + }, + { + "epoch": 0.22208, + "grad_norm": 0.9765625, + "learning_rate": 7.84209677419355e-05, + "loss": 0.185, + "step": 13880 + }, + { + "epoch": 0.222096, + "grad_norm": 0.7578125, + "learning_rate": 7.841935483870968e-05, + "loss": 0.1918, + "step": 13881 + }, + { + "epoch": 0.222112, + "grad_norm": 0.82421875, + "learning_rate": 7.841774193548388e-05, + "loss": 0.1576, + "step": 13882 + }, + { + "epoch": 0.222128, + "grad_norm": 0.78125, + "learning_rate": 7.841612903225807e-05, + "loss": 0.1649, + "step": 13883 + }, + { + "epoch": 0.222144, + "grad_norm": 0.7109375, + "learning_rate": 7.841451612903227e-05, + "loss": 0.1377, + "step": 13884 + }, + { + "epoch": 0.22216, + "grad_norm": 1.0859375, + "learning_rate": 7.841290322580645e-05, + "loss": 0.1809, + "step": 13885 + }, + { + "epoch": 0.222176, + "grad_norm": 0.8359375, + "learning_rate": 7.841129032258064e-05, + "loss": 0.2055, + "step": 13886 + }, + { + "epoch": 0.222192, + "grad_norm": 0.60546875, + "learning_rate": 7.840967741935484e-05, + "loss": 0.1543, + "step": 13887 + }, + { + "epoch": 0.222208, + "grad_norm": 0.67578125, + "learning_rate": 7.840806451612904e-05, + "loss": 0.1567, + "step": 13888 + }, + { + "epoch": 0.222224, + "grad_norm": 0.890625, + "learning_rate": 7.840645161290324e-05, + "loss": 0.1627, + "step": 13889 + }, + { + "epoch": 0.22224, + "grad_norm": 0.734375, + "learning_rate": 7.840483870967742e-05, + "loss": 0.1856, + "step": 13890 + }, + { + "epoch": 0.222256, + "grad_norm": 0.73828125, + "learning_rate": 7.840322580645162e-05, + "loss": 0.1729, + "step": 13891 + }, + { + "epoch": 0.222272, + "grad_norm": 1.0234375, + "learning_rate": 7.840161290322581e-05, + "loss": 0.189, + "step": 13892 + }, + { + "epoch": 0.222288, + "grad_norm": 0.671875, + "learning_rate": 7.840000000000001e-05, + "loss": 0.1269, + "step": 13893 + }, + { + "epoch": 0.222304, + "grad_norm": 0.6953125, + "learning_rate": 7.83983870967742e-05, + "loss": 0.1548, + "step": 13894 + }, + { + "epoch": 0.22232, + "grad_norm": 0.94140625, + "learning_rate": 7.83967741935484e-05, + "loss": 0.2006, + "step": 13895 + }, + { + "epoch": 0.222336, + "grad_norm": 1.078125, + "learning_rate": 7.839516129032258e-05, + "loss": 0.1537, + "step": 13896 + }, + { + "epoch": 0.222352, + "grad_norm": 1.0234375, + "learning_rate": 7.839354838709678e-05, + "loss": 0.1593, + "step": 13897 + }, + { + "epoch": 0.222368, + "grad_norm": 0.78125, + "learning_rate": 7.839193548387097e-05, + "loss": 0.1595, + "step": 13898 + }, + { + "epoch": 0.222384, + "grad_norm": 0.56640625, + "learning_rate": 7.839032258064517e-05, + "loss": 0.1567, + "step": 13899 + }, + { + "epoch": 0.2224, + "grad_norm": 1.1953125, + "learning_rate": 7.838870967741937e-05, + "loss": 0.1924, + "step": 13900 + }, + { + "epoch": 0.222416, + "grad_norm": 0.65625, + "learning_rate": 7.838709677419355e-05, + "loss": 0.1893, + "step": 13901 + }, + { + "epoch": 0.222432, + "grad_norm": 0.9765625, + "learning_rate": 7.838548387096775e-05, + "loss": 0.222, + "step": 13902 + }, + { + "epoch": 0.222448, + "grad_norm": 0.6484375, + "learning_rate": 7.838387096774194e-05, + "loss": 0.1665, + "step": 13903 + }, + { + "epoch": 0.222464, + "grad_norm": 0.63671875, + "learning_rate": 7.838225806451614e-05, + "loss": 0.1742, + "step": 13904 + }, + { + "epoch": 0.22248, + "grad_norm": 0.859375, + "learning_rate": 7.838064516129032e-05, + "loss": 0.2009, + "step": 13905 + }, + { + "epoch": 0.222496, + "grad_norm": 0.671875, + "learning_rate": 7.837903225806452e-05, + "loss": 0.1258, + "step": 13906 + }, + { + "epoch": 0.222512, + "grad_norm": 1.2421875, + "learning_rate": 7.837741935483871e-05, + "loss": 0.168, + "step": 13907 + }, + { + "epoch": 0.222528, + "grad_norm": 0.94921875, + "learning_rate": 7.837580645161291e-05, + "loss": 0.1838, + "step": 13908 + }, + { + "epoch": 0.222544, + "grad_norm": 0.703125, + "learning_rate": 7.83741935483871e-05, + "loss": 0.1871, + "step": 13909 + }, + { + "epoch": 0.22256, + "grad_norm": 0.66015625, + "learning_rate": 7.83725806451613e-05, + "loss": 0.1643, + "step": 13910 + }, + { + "epoch": 0.222576, + "grad_norm": 1.0625, + "learning_rate": 7.837096774193548e-05, + "loss": 0.159, + "step": 13911 + }, + { + "epoch": 0.222592, + "grad_norm": 0.8203125, + "learning_rate": 7.836935483870968e-05, + "loss": 0.1533, + "step": 13912 + }, + { + "epoch": 0.222608, + "grad_norm": 0.91015625, + "learning_rate": 7.836774193548388e-05, + "loss": 0.226, + "step": 13913 + }, + { + "epoch": 0.222624, + "grad_norm": 0.93359375, + "learning_rate": 7.836612903225808e-05, + "loss": 0.2264, + "step": 13914 + }, + { + "epoch": 0.22264, + "grad_norm": 0.6484375, + "learning_rate": 7.836451612903226e-05, + "loss": 0.189, + "step": 13915 + }, + { + "epoch": 0.222656, + "grad_norm": 0.62890625, + "learning_rate": 7.836290322580645e-05, + "loss": 0.1651, + "step": 13916 + }, + { + "epoch": 0.222672, + "grad_norm": 0.55078125, + "learning_rate": 7.836129032258065e-05, + "loss": 0.1513, + "step": 13917 + }, + { + "epoch": 0.222688, + "grad_norm": 0.96484375, + "learning_rate": 7.835967741935484e-05, + "loss": 0.1777, + "step": 13918 + }, + { + "epoch": 0.222704, + "grad_norm": 0.64453125, + "learning_rate": 7.835806451612904e-05, + "loss": 0.1591, + "step": 13919 + }, + { + "epoch": 0.22272, + "grad_norm": 0.83203125, + "learning_rate": 7.835645161290322e-05, + "loss": 0.1864, + "step": 13920 + }, + { + "epoch": 0.222736, + "grad_norm": 0.84375, + "learning_rate": 7.835483870967742e-05, + "loss": 0.1739, + "step": 13921 + }, + { + "epoch": 0.222752, + "grad_norm": 1.109375, + "learning_rate": 7.835322580645161e-05, + "loss": 0.2232, + "step": 13922 + }, + { + "epoch": 0.222768, + "grad_norm": 0.71875, + "learning_rate": 7.835161290322581e-05, + "loss": 0.1643, + "step": 13923 + }, + { + "epoch": 0.222784, + "grad_norm": 0.80859375, + "learning_rate": 7.835000000000001e-05, + "loss": 0.2218, + "step": 13924 + }, + { + "epoch": 0.2228, + "grad_norm": 1.0390625, + "learning_rate": 7.83483870967742e-05, + "loss": 0.2278, + "step": 13925 + }, + { + "epoch": 0.222816, + "grad_norm": 0.9375, + "learning_rate": 7.834677419354839e-05, + "loss": 0.2021, + "step": 13926 + }, + { + "epoch": 0.222832, + "grad_norm": 0.6640625, + "learning_rate": 7.834516129032259e-05, + "loss": 0.2093, + "step": 13927 + }, + { + "epoch": 0.222848, + "grad_norm": 0.59375, + "learning_rate": 7.834354838709678e-05, + "loss": 0.1678, + "step": 13928 + }, + { + "epoch": 0.222864, + "grad_norm": 1.2734375, + "learning_rate": 7.834193548387098e-05, + "loss": 0.1766, + "step": 13929 + }, + { + "epoch": 0.22288, + "grad_norm": 0.78125, + "learning_rate": 7.834032258064516e-05, + "loss": 0.1676, + "step": 13930 + }, + { + "epoch": 0.222896, + "grad_norm": 0.54296875, + "learning_rate": 7.833870967741936e-05, + "loss": 0.142, + "step": 13931 + }, + { + "epoch": 0.222912, + "grad_norm": 0.6796875, + "learning_rate": 7.833709677419355e-05, + "loss": 0.1583, + "step": 13932 + }, + { + "epoch": 0.222928, + "grad_norm": 0.61328125, + "learning_rate": 7.833548387096774e-05, + "loss": 0.1581, + "step": 13933 + }, + { + "epoch": 0.222944, + "grad_norm": 0.7265625, + "learning_rate": 7.833387096774194e-05, + "loss": 0.1878, + "step": 13934 + }, + { + "epoch": 0.22296, + "grad_norm": 0.58984375, + "learning_rate": 7.833225806451614e-05, + "loss": 0.1629, + "step": 13935 + }, + { + "epoch": 0.222976, + "grad_norm": 0.94921875, + "learning_rate": 7.833064516129032e-05, + "loss": 0.1574, + "step": 13936 + }, + { + "epoch": 0.222992, + "grad_norm": 1.046875, + "learning_rate": 7.832903225806452e-05, + "loss": 0.1508, + "step": 13937 + }, + { + "epoch": 0.223008, + "grad_norm": 0.71484375, + "learning_rate": 7.832741935483872e-05, + "loss": 0.1426, + "step": 13938 + }, + { + "epoch": 0.223024, + "grad_norm": 0.62109375, + "learning_rate": 7.83258064516129e-05, + "loss": 0.1761, + "step": 13939 + }, + { + "epoch": 0.22304, + "grad_norm": 0.53515625, + "learning_rate": 7.83241935483871e-05, + "loss": 0.1337, + "step": 13940 + }, + { + "epoch": 0.223056, + "grad_norm": 0.78125, + "learning_rate": 7.832258064516129e-05, + "loss": 0.2084, + "step": 13941 + }, + { + "epoch": 0.223072, + "grad_norm": 1.09375, + "learning_rate": 7.832096774193549e-05, + "loss": 0.1724, + "step": 13942 + }, + { + "epoch": 0.223088, + "grad_norm": 1.328125, + "learning_rate": 7.831935483870968e-05, + "loss": 0.1487, + "step": 13943 + }, + { + "epoch": 0.223104, + "grad_norm": 0.625, + "learning_rate": 7.831774193548388e-05, + "loss": 0.1435, + "step": 13944 + }, + { + "epoch": 0.22312, + "grad_norm": 1.0546875, + "learning_rate": 7.831612903225806e-05, + "loss": 0.1932, + "step": 13945 + }, + { + "epoch": 0.223136, + "grad_norm": 0.70703125, + "learning_rate": 7.831451612903226e-05, + "loss": 0.1718, + "step": 13946 + }, + { + "epoch": 0.223152, + "grad_norm": 1.0234375, + "learning_rate": 7.831290322580645e-05, + "loss": 0.1781, + "step": 13947 + }, + { + "epoch": 0.223168, + "grad_norm": 0.78515625, + "learning_rate": 7.831129032258065e-05, + "loss": 0.1603, + "step": 13948 + }, + { + "epoch": 0.223184, + "grad_norm": 0.6875, + "learning_rate": 7.830967741935485e-05, + "loss": 0.1589, + "step": 13949 + }, + { + "epoch": 0.2232, + "grad_norm": 0.97265625, + "learning_rate": 7.830806451612903e-05, + "loss": 0.2374, + "step": 13950 + }, + { + "epoch": 0.223216, + "grad_norm": 1.109375, + "learning_rate": 7.830645161290323e-05, + "loss": 0.185, + "step": 13951 + }, + { + "epoch": 0.223232, + "grad_norm": 0.56640625, + "learning_rate": 7.830483870967742e-05, + "loss": 0.1773, + "step": 13952 + }, + { + "epoch": 0.223248, + "grad_norm": 0.62109375, + "learning_rate": 7.830322580645162e-05, + "loss": 0.1753, + "step": 13953 + }, + { + "epoch": 0.223264, + "grad_norm": 0.78125, + "learning_rate": 7.83016129032258e-05, + "loss": 0.1733, + "step": 13954 + }, + { + "epoch": 0.22328, + "grad_norm": 0.9140625, + "learning_rate": 7.83e-05, + "loss": 0.1651, + "step": 13955 + }, + { + "epoch": 0.223296, + "grad_norm": 0.57421875, + "learning_rate": 7.829838709677419e-05, + "loss": 0.1787, + "step": 13956 + }, + { + "epoch": 0.223312, + "grad_norm": 0.828125, + "learning_rate": 7.829677419354839e-05, + "loss": 0.1641, + "step": 13957 + }, + { + "epoch": 0.223328, + "grad_norm": 0.66796875, + "learning_rate": 7.829516129032258e-05, + "loss": 0.1391, + "step": 13958 + }, + { + "epoch": 0.223344, + "grad_norm": 0.74609375, + "learning_rate": 7.829354838709678e-05, + "loss": 0.1514, + "step": 13959 + }, + { + "epoch": 0.22336, + "grad_norm": 1.296875, + "learning_rate": 7.829193548387098e-05, + "loss": 0.1934, + "step": 13960 + }, + { + "epoch": 0.223376, + "grad_norm": 1.1953125, + "learning_rate": 7.829032258064518e-05, + "loss": 0.2015, + "step": 13961 + }, + { + "epoch": 0.223392, + "grad_norm": 0.82421875, + "learning_rate": 7.828870967741936e-05, + "loss": 0.1808, + "step": 13962 + }, + { + "epoch": 0.223408, + "grad_norm": 0.671875, + "learning_rate": 7.828709677419355e-05, + "loss": 0.1393, + "step": 13963 + }, + { + "epoch": 0.223424, + "grad_norm": 1.0234375, + "learning_rate": 7.828548387096775e-05, + "loss": 0.1767, + "step": 13964 + }, + { + "epoch": 0.22344, + "grad_norm": 1.125, + "learning_rate": 7.828387096774193e-05, + "loss": 0.1671, + "step": 13965 + }, + { + "epoch": 0.223456, + "grad_norm": 0.6328125, + "learning_rate": 7.828225806451613e-05, + "loss": 0.1545, + "step": 13966 + }, + { + "epoch": 0.223472, + "grad_norm": 0.97265625, + "learning_rate": 7.828064516129032e-05, + "loss": 0.2106, + "step": 13967 + }, + { + "epoch": 0.223488, + "grad_norm": 0.77734375, + "learning_rate": 7.827903225806452e-05, + "loss": 0.163, + "step": 13968 + }, + { + "epoch": 0.223504, + "grad_norm": 1.75, + "learning_rate": 7.82774193548387e-05, + "loss": 0.1573, + "step": 13969 + }, + { + "epoch": 0.22352, + "grad_norm": 0.75, + "learning_rate": 7.82758064516129e-05, + "loss": 0.1483, + "step": 13970 + }, + { + "epoch": 0.223536, + "grad_norm": 0.765625, + "learning_rate": 7.827419354838709e-05, + "loss": 0.1455, + "step": 13971 + }, + { + "epoch": 0.223552, + "grad_norm": 1.5546875, + "learning_rate": 7.827258064516129e-05, + "loss": 0.1765, + "step": 13972 + }, + { + "epoch": 0.223568, + "grad_norm": 1.0703125, + "learning_rate": 7.827096774193549e-05, + "loss": 0.1542, + "step": 13973 + }, + { + "epoch": 0.223584, + "grad_norm": 0.765625, + "learning_rate": 7.826935483870969e-05, + "loss": 0.1813, + "step": 13974 + }, + { + "epoch": 0.2236, + "grad_norm": 0.69921875, + "learning_rate": 7.826774193548388e-05, + "loss": 0.163, + "step": 13975 + }, + { + "epoch": 0.223616, + "grad_norm": 1.0625, + "learning_rate": 7.826612903225808e-05, + "loss": 0.2137, + "step": 13976 + }, + { + "epoch": 0.223632, + "grad_norm": 1.1328125, + "learning_rate": 7.826451612903226e-05, + "loss": 0.2287, + "step": 13977 + }, + { + "epoch": 0.223648, + "grad_norm": 1.15625, + "learning_rate": 7.826290322580645e-05, + "loss": 0.1746, + "step": 13978 + }, + { + "epoch": 0.223664, + "grad_norm": 0.984375, + "learning_rate": 7.826129032258065e-05, + "loss": 0.1698, + "step": 13979 + }, + { + "epoch": 0.22368, + "grad_norm": 0.7578125, + "learning_rate": 7.825967741935483e-05, + "loss": 0.1693, + "step": 13980 + }, + { + "epoch": 0.223696, + "grad_norm": 0.9609375, + "learning_rate": 7.825806451612903e-05, + "loss": 0.1966, + "step": 13981 + }, + { + "epoch": 0.223712, + "grad_norm": 0.5703125, + "learning_rate": 7.825645161290322e-05, + "loss": 0.1635, + "step": 13982 + }, + { + "epoch": 0.223728, + "grad_norm": 0.8046875, + "learning_rate": 7.825483870967742e-05, + "loss": 0.1868, + "step": 13983 + }, + { + "epoch": 0.223744, + "grad_norm": 0.61328125, + "learning_rate": 7.825322580645162e-05, + "loss": 0.1567, + "step": 13984 + }, + { + "epoch": 0.22376, + "grad_norm": 0.5703125, + "learning_rate": 7.825161290322582e-05, + "loss": 0.1699, + "step": 13985 + }, + { + "epoch": 0.223776, + "grad_norm": 0.6171875, + "learning_rate": 7.825e-05, + "loss": 0.1655, + "step": 13986 + }, + { + "epoch": 0.223792, + "grad_norm": 0.70703125, + "learning_rate": 7.82483870967742e-05, + "loss": 0.1942, + "step": 13987 + }, + { + "epoch": 0.223808, + "grad_norm": 0.78125, + "learning_rate": 7.824677419354839e-05, + "loss": 0.1829, + "step": 13988 + }, + { + "epoch": 0.223824, + "grad_norm": 1.390625, + "learning_rate": 7.824516129032259e-05, + "loss": 0.1769, + "step": 13989 + }, + { + "epoch": 0.22384, + "grad_norm": 0.73828125, + "learning_rate": 7.824354838709678e-05, + "loss": 0.1744, + "step": 13990 + }, + { + "epoch": 0.223856, + "grad_norm": 1.1171875, + "learning_rate": 7.824193548387098e-05, + "loss": 0.1623, + "step": 13991 + }, + { + "epoch": 0.223872, + "grad_norm": 0.640625, + "learning_rate": 7.824032258064516e-05, + "loss": 0.1887, + "step": 13992 + }, + { + "epoch": 0.223888, + "grad_norm": 1.125, + "learning_rate": 7.823870967741936e-05, + "loss": 0.1294, + "step": 13993 + }, + { + "epoch": 0.223904, + "grad_norm": 0.5546875, + "learning_rate": 7.823709677419355e-05, + "loss": 0.162, + "step": 13994 + }, + { + "epoch": 0.22392, + "grad_norm": 0.875, + "learning_rate": 7.823548387096775e-05, + "loss": 0.1498, + "step": 13995 + }, + { + "epoch": 0.223936, + "grad_norm": 0.91015625, + "learning_rate": 7.823387096774195e-05, + "loss": 0.1688, + "step": 13996 + }, + { + "epoch": 0.223952, + "grad_norm": 1.09375, + "learning_rate": 7.823225806451613e-05, + "loss": 0.155, + "step": 13997 + }, + { + "epoch": 0.223968, + "grad_norm": 0.76953125, + "learning_rate": 7.823064516129033e-05, + "loss": 0.1933, + "step": 13998 + }, + { + "epoch": 0.223984, + "grad_norm": 0.8984375, + "learning_rate": 7.822903225806452e-05, + "loss": 0.1853, + "step": 13999 + }, + { + "epoch": 0.224, + "grad_norm": 0.87109375, + "learning_rate": 7.822741935483872e-05, + "loss": 0.1387, + "step": 14000 + }, + { + "epoch": 0.224016, + "grad_norm": 0.859375, + "learning_rate": 7.82258064516129e-05, + "loss": 0.1736, + "step": 14001 + }, + { + "epoch": 0.224032, + "grad_norm": 0.71875, + "learning_rate": 7.82241935483871e-05, + "loss": 0.198, + "step": 14002 + }, + { + "epoch": 0.224048, + "grad_norm": 0.73828125, + "learning_rate": 7.822258064516129e-05, + "loss": 0.1641, + "step": 14003 + }, + { + "epoch": 0.224064, + "grad_norm": 0.59375, + "learning_rate": 7.822096774193549e-05, + "loss": 0.1551, + "step": 14004 + }, + { + "epoch": 0.22408, + "grad_norm": 0.7421875, + "learning_rate": 7.821935483870968e-05, + "loss": 0.1426, + "step": 14005 + }, + { + "epoch": 0.224096, + "grad_norm": 0.64453125, + "learning_rate": 7.821774193548388e-05, + "loss": 0.1415, + "step": 14006 + }, + { + "epoch": 0.224112, + "grad_norm": 0.7890625, + "learning_rate": 7.821612903225806e-05, + "loss": 0.1962, + "step": 14007 + }, + { + "epoch": 0.224128, + "grad_norm": 0.65625, + "learning_rate": 7.821451612903226e-05, + "loss": 0.1439, + "step": 14008 + }, + { + "epoch": 0.224144, + "grad_norm": 0.83203125, + "learning_rate": 7.821290322580646e-05, + "loss": 0.2072, + "step": 14009 + }, + { + "epoch": 0.22416, + "grad_norm": 0.7421875, + "learning_rate": 7.821129032258065e-05, + "loss": 0.1742, + "step": 14010 + }, + { + "epoch": 0.224176, + "grad_norm": 1.765625, + "learning_rate": 7.820967741935485e-05, + "loss": 0.2172, + "step": 14011 + }, + { + "epoch": 0.224192, + "grad_norm": 0.55078125, + "learning_rate": 7.820806451612903e-05, + "loss": 0.1714, + "step": 14012 + }, + { + "epoch": 0.224208, + "grad_norm": 0.73828125, + "learning_rate": 7.820645161290323e-05, + "loss": 0.1929, + "step": 14013 + }, + { + "epoch": 0.224224, + "grad_norm": 0.91015625, + "learning_rate": 7.820483870967742e-05, + "loss": 0.1603, + "step": 14014 + }, + { + "epoch": 0.22424, + "grad_norm": 1.1328125, + "learning_rate": 7.820322580645162e-05, + "loss": 0.1753, + "step": 14015 + }, + { + "epoch": 0.224256, + "grad_norm": 0.98828125, + "learning_rate": 7.82016129032258e-05, + "loss": 0.1853, + "step": 14016 + }, + { + "epoch": 0.224272, + "grad_norm": 0.9453125, + "learning_rate": 7.82e-05, + "loss": 0.1863, + "step": 14017 + }, + { + "epoch": 0.224288, + "grad_norm": 0.66796875, + "learning_rate": 7.819838709677419e-05, + "loss": 0.1529, + "step": 14018 + }, + { + "epoch": 0.224304, + "grad_norm": 0.796875, + "learning_rate": 7.819677419354839e-05, + "loss": 0.1495, + "step": 14019 + }, + { + "epoch": 0.22432, + "grad_norm": 0.6640625, + "learning_rate": 7.819516129032259e-05, + "loss": 0.1833, + "step": 14020 + }, + { + "epoch": 0.224336, + "grad_norm": 0.65625, + "learning_rate": 7.819354838709679e-05, + "loss": 0.1779, + "step": 14021 + }, + { + "epoch": 0.224352, + "grad_norm": 0.90234375, + "learning_rate": 7.819193548387097e-05, + "loss": 0.1807, + "step": 14022 + }, + { + "epoch": 0.224368, + "grad_norm": 0.5859375, + "learning_rate": 7.819032258064517e-05, + "loss": 0.1421, + "step": 14023 + }, + { + "epoch": 0.224384, + "grad_norm": 0.734375, + "learning_rate": 7.818870967741936e-05, + "loss": 0.1828, + "step": 14024 + }, + { + "epoch": 0.2244, + "grad_norm": 0.75, + "learning_rate": 7.818709677419355e-05, + "loss": 0.1885, + "step": 14025 + }, + { + "epoch": 0.224416, + "grad_norm": 0.8828125, + "learning_rate": 7.818548387096775e-05, + "loss": 0.1664, + "step": 14026 + }, + { + "epoch": 0.224432, + "grad_norm": 0.578125, + "learning_rate": 7.818387096774193e-05, + "loss": 0.1521, + "step": 14027 + }, + { + "epoch": 0.224448, + "grad_norm": 0.671875, + "learning_rate": 7.818225806451613e-05, + "loss": 0.2118, + "step": 14028 + }, + { + "epoch": 0.224464, + "grad_norm": 0.64453125, + "learning_rate": 7.818064516129032e-05, + "loss": 0.1807, + "step": 14029 + }, + { + "epoch": 0.22448, + "grad_norm": 0.66796875, + "learning_rate": 7.817903225806452e-05, + "loss": 0.1512, + "step": 14030 + }, + { + "epoch": 0.224496, + "grad_norm": 0.74609375, + "learning_rate": 7.81774193548387e-05, + "loss": 0.172, + "step": 14031 + }, + { + "epoch": 0.224512, + "grad_norm": 0.53125, + "learning_rate": 7.81758064516129e-05, + "loss": 0.1426, + "step": 14032 + }, + { + "epoch": 0.224528, + "grad_norm": 0.890625, + "learning_rate": 7.81741935483871e-05, + "loss": 0.1775, + "step": 14033 + }, + { + "epoch": 0.224544, + "grad_norm": 0.84765625, + "learning_rate": 7.81725806451613e-05, + "loss": 0.1846, + "step": 14034 + }, + { + "epoch": 0.22456, + "grad_norm": 0.69140625, + "learning_rate": 7.817096774193549e-05, + "loss": 0.1547, + "step": 14035 + }, + { + "epoch": 0.224576, + "grad_norm": 0.7578125, + "learning_rate": 7.816935483870969e-05, + "loss": 0.1517, + "step": 14036 + }, + { + "epoch": 0.224592, + "grad_norm": 0.8828125, + "learning_rate": 7.816774193548387e-05, + "loss": 0.1413, + "step": 14037 + }, + { + "epoch": 0.224608, + "grad_norm": 0.6953125, + "learning_rate": 7.816612903225807e-05, + "loss": 0.1774, + "step": 14038 + }, + { + "epoch": 0.224624, + "grad_norm": 1.15625, + "learning_rate": 7.816451612903226e-05, + "loss": 0.1903, + "step": 14039 + }, + { + "epoch": 0.22464, + "grad_norm": 0.80859375, + "learning_rate": 7.816290322580646e-05, + "loss": 0.1739, + "step": 14040 + }, + { + "epoch": 0.224656, + "grad_norm": 0.80859375, + "learning_rate": 7.816129032258065e-05, + "loss": 0.2104, + "step": 14041 + }, + { + "epoch": 0.224672, + "grad_norm": 0.875, + "learning_rate": 7.815967741935483e-05, + "loss": 0.181, + "step": 14042 + }, + { + "epoch": 0.224688, + "grad_norm": 1.0390625, + "learning_rate": 7.815806451612903e-05, + "loss": 0.1758, + "step": 14043 + }, + { + "epoch": 0.224704, + "grad_norm": 1.1484375, + "learning_rate": 7.815645161290323e-05, + "loss": 0.162, + "step": 14044 + }, + { + "epoch": 0.22472, + "grad_norm": 0.95703125, + "learning_rate": 7.815483870967743e-05, + "loss": 0.1824, + "step": 14045 + }, + { + "epoch": 0.224736, + "grad_norm": 0.65625, + "learning_rate": 7.815322580645162e-05, + "loss": 0.1684, + "step": 14046 + }, + { + "epoch": 0.224752, + "grad_norm": 0.76171875, + "learning_rate": 7.815161290322582e-05, + "loss": 0.2358, + "step": 14047 + }, + { + "epoch": 0.224768, + "grad_norm": 0.625, + "learning_rate": 7.815e-05, + "loss": 0.1611, + "step": 14048 + }, + { + "epoch": 0.224784, + "grad_norm": 1.453125, + "learning_rate": 7.81483870967742e-05, + "loss": 0.2033, + "step": 14049 + }, + { + "epoch": 0.2248, + "grad_norm": 0.6953125, + "learning_rate": 7.814677419354839e-05, + "loss": 0.1923, + "step": 14050 + }, + { + "epoch": 0.224816, + "grad_norm": 1.671875, + "learning_rate": 7.814516129032259e-05, + "loss": 0.1972, + "step": 14051 + }, + { + "epoch": 0.224832, + "grad_norm": 0.93359375, + "learning_rate": 7.814354838709677e-05, + "loss": 0.1639, + "step": 14052 + }, + { + "epoch": 0.224848, + "grad_norm": 1.4296875, + "learning_rate": 7.814193548387097e-05, + "loss": 0.1846, + "step": 14053 + }, + { + "epoch": 0.224864, + "grad_norm": 0.8203125, + "learning_rate": 7.814032258064516e-05, + "loss": 0.1634, + "step": 14054 + }, + { + "epoch": 0.22488, + "grad_norm": 0.765625, + "learning_rate": 7.813870967741936e-05, + "loss": 0.1816, + "step": 14055 + }, + { + "epoch": 0.224896, + "grad_norm": 0.60546875, + "learning_rate": 7.813709677419356e-05, + "loss": 0.181, + "step": 14056 + }, + { + "epoch": 0.224912, + "grad_norm": 0.609375, + "learning_rate": 7.813548387096774e-05, + "loss": 0.1471, + "step": 14057 + }, + { + "epoch": 0.224928, + "grad_norm": 0.486328125, + "learning_rate": 7.813387096774194e-05, + "loss": 0.1558, + "step": 14058 + }, + { + "epoch": 0.224944, + "grad_norm": 0.8046875, + "learning_rate": 7.813225806451613e-05, + "loss": 0.1537, + "step": 14059 + }, + { + "epoch": 0.22496, + "grad_norm": 0.6484375, + "learning_rate": 7.813064516129033e-05, + "loss": 0.1667, + "step": 14060 + }, + { + "epoch": 0.224976, + "grad_norm": 0.70703125, + "learning_rate": 7.812903225806452e-05, + "loss": 0.1887, + "step": 14061 + }, + { + "epoch": 0.224992, + "grad_norm": 1.015625, + "learning_rate": 7.812741935483872e-05, + "loss": 0.188, + "step": 14062 + }, + { + "epoch": 0.225008, + "grad_norm": 0.80859375, + "learning_rate": 7.81258064516129e-05, + "loss": 0.1686, + "step": 14063 + }, + { + "epoch": 0.225024, + "grad_norm": 0.92578125, + "learning_rate": 7.81241935483871e-05, + "loss": 0.1894, + "step": 14064 + }, + { + "epoch": 0.22504, + "grad_norm": 1.0703125, + "learning_rate": 7.812258064516129e-05, + "loss": 0.1771, + "step": 14065 + }, + { + "epoch": 0.225056, + "grad_norm": 0.8125, + "learning_rate": 7.812096774193549e-05, + "loss": 0.1514, + "step": 14066 + }, + { + "epoch": 0.225072, + "grad_norm": 1.234375, + "learning_rate": 7.811935483870967e-05, + "loss": 0.2518, + "step": 14067 + }, + { + "epoch": 0.225088, + "grad_norm": 0.765625, + "learning_rate": 7.811774193548387e-05, + "loss": 0.1362, + "step": 14068 + }, + { + "epoch": 0.225104, + "grad_norm": 0.87890625, + "learning_rate": 7.811612903225807e-05, + "loss": 0.2107, + "step": 14069 + }, + { + "epoch": 0.22512, + "grad_norm": 0.93359375, + "learning_rate": 7.811451612903227e-05, + "loss": 0.2027, + "step": 14070 + }, + { + "epoch": 0.225136, + "grad_norm": 0.4921875, + "learning_rate": 7.811290322580646e-05, + "loss": 0.1782, + "step": 14071 + }, + { + "epoch": 0.225152, + "grad_norm": 0.984375, + "learning_rate": 7.811129032258064e-05, + "loss": 0.1502, + "step": 14072 + }, + { + "epoch": 0.225168, + "grad_norm": 1.125, + "learning_rate": 7.810967741935484e-05, + "loss": 0.1811, + "step": 14073 + }, + { + "epoch": 0.225184, + "grad_norm": 0.8984375, + "learning_rate": 7.810806451612903e-05, + "loss": 0.1768, + "step": 14074 + }, + { + "epoch": 0.2252, + "grad_norm": 1.046875, + "learning_rate": 7.810645161290323e-05, + "loss": 0.1572, + "step": 14075 + }, + { + "epoch": 0.225216, + "grad_norm": 1.3359375, + "learning_rate": 7.810483870967742e-05, + "loss": 0.1875, + "step": 14076 + }, + { + "epoch": 0.225232, + "grad_norm": 0.5546875, + "learning_rate": 7.810322580645162e-05, + "loss": 0.1754, + "step": 14077 + }, + { + "epoch": 0.225248, + "grad_norm": 0.6328125, + "learning_rate": 7.81016129032258e-05, + "loss": 0.125, + "step": 14078 + }, + { + "epoch": 0.225264, + "grad_norm": 1.1875, + "learning_rate": 7.81e-05, + "loss": 0.2091, + "step": 14079 + }, + { + "epoch": 0.22528, + "grad_norm": 0.73046875, + "learning_rate": 7.80983870967742e-05, + "loss": 0.193, + "step": 14080 + }, + { + "epoch": 0.225296, + "grad_norm": 0.5703125, + "learning_rate": 7.80967741935484e-05, + "loss": 0.1764, + "step": 14081 + }, + { + "epoch": 0.225312, + "grad_norm": 0.8515625, + "learning_rate": 7.809516129032259e-05, + "loss": 0.1149, + "step": 14082 + }, + { + "epoch": 0.225328, + "grad_norm": 0.734375, + "learning_rate": 7.809354838709679e-05, + "loss": 0.172, + "step": 14083 + }, + { + "epoch": 0.225344, + "grad_norm": 0.7578125, + "learning_rate": 7.809193548387097e-05, + "loss": 0.1698, + "step": 14084 + }, + { + "epoch": 0.22536, + "grad_norm": 0.68359375, + "learning_rate": 7.809032258064517e-05, + "loss": 0.137, + "step": 14085 + }, + { + "epoch": 0.225376, + "grad_norm": 0.50390625, + "learning_rate": 7.808870967741936e-05, + "loss": 0.1196, + "step": 14086 + }, + { + "epoch": 0.225392, + "grad_norm": 0.68359375, + "learning_rate": 7.808709677419354e-05, + "loss": 0.1589, + "step": 14087 + }, + { + "epoch": 0.225408, + "grad_norm": 0.80078125, + "learning_rate": 7.808548387096774e-05, + "loss": 0.1654, + "step": 14088 + }, + { + "epoch": 0.225424, + "grad_norm": 0.78125, + "learning_rate": 7.808387096774193e-05, + "loss": 0.2089, + "step": 14089 + }, + { + "epoch": 0.22544, + "grad_norm": 0.84375, + "learning_rate": 7.808225806451613e-05, + "loss": 0.1744, + "step": 14090 + }, + { + "epoch": 0.225456, + "grad_norm": 0.8984375, + "learning_rate": 7.808064516129033e-05, + "loss": 0.2314, + "step": 14091 + }, + { + "epoch": 0.225472, + "grad_norm": 0.91015625, + "learning_rate": 7.807903225806453e-05, + "loss": 0.1615, + "step": 14092 + }, + { + "epoch": 0.225488, + "grad_norm": 0.671875, + "learning_rate": 7.807741935483871e-05, + "loss": 0.1409, + "step": 14093 + }, + { + "epoch": 0.225504, + "grad_norm": 1.078125, + "learning_rate": 7.807580645161291e-05, + "loss": 0.1678, + "step": 14094 + }, + { + "epoch": 0.22552, + "grad_norm": 0.77734375, + "learning_rate": 7.80741935483871e-05, + "loss": 0.1588, + "step": 14095 + }, + { + "epoch": 0.225536, + "grad_norm": 0.734375, + "learning_rate": 7.80725806451613e-05, + "loss": 0.1546, + "step": 14096 + }, + { + "epoch": 0.225552, + "grad_norm": 1.1640625, + "learning_rate": 7.807096774193549e-05, + "loss": 0.1694, + "step": 14097 + }, + { + "epoch": 0.225568, + "grad_norm": 0.8828125, + "learning_rate": 7.806935483870969e-05, + "loss": 0.1621, + "step": 14098 + }, + { + "epoch": 0.225584, + "grad_norm": 0.5859375, + "learning_rate": 7.806774193548387e-05, + "loss": 0.1782, + "step": 14099 + }, + { + "epoch": 0.2256, + "grad_norm": 0.625, + "learning_rate": 7.806612903225807e-05, + "loss": 0.2007, + "step": 14100 + }, + { + "epoch": 0.225616, + "grad_norm": 0.546875, + "learning_rate": 7.806451612903226e-05, + "loss": 0.1504, + "step": 14101 + }, + { + "epoch": 0.225632, + "grad_norm": 0.6171875, + "learning_rate": 7.806290322580646e-05, + "loss": 0.156, + "step": 14102 + }, + { + "epoch": 0.225648, + "grad_norm": 0.7265625, + "learning_rate": 7.806129032258064e-05, + "loss": 0.1531, + "step": 14103 + }, + { + "epoch": 0.225664, + "grad_norm": 0.74609375, + "learning_rate": 7.805967741935484e-05, + "loss": 0.1698, + "step": 14104 + }, + { + "epoch": 0.22568, + "grad_norm": 0.88671875, + "learning_rate": 7.805806451612904e-05, + "loss": 0.1732, + "step": 14105 + }, + { + "epoch": 0.225696, + "grad_norm": 1.484375, + "learning_rate": 7.805645161290323e-05, + "loss": 0.1876, + "step": 14106 + }, + { + "epoch": 0.225712, + "grad_norm": 0.66015625, + "learning_rate": 7.805483870967743e-05, + "loss": 0.1403, + "step": 14107 + }, + { + "epoch": 0.225728, + "grad_norm": 0.80859375, + "learning_rate": 7.805322580645161e-05, + "loss": 0.1709, + "step": 14108 + }, + { + "epoch": 0.225744, + "grad_norm": 0.8359375, + "learning_rate": 7.805161290322581e-05, + "loss": 0.1798, + "step": 14109 + }, + { + "epoch": 0.22576, + "grad_norm": 0.984375, + "learning_rate": 7.805e-05, + "loss": 0.1563, + "step": 14110 + }, + { + "epoch": 0.225776, + "grad_norm": 0.6484375, + "learning_rate": 7.80483870967742e-05, + "loss": 0.1422, + "step": 14111 + }, + { + "epoch": 0.225792, + "grad_norm": 0.73828125, + "learning_rate": 7.804677419354839e-05, + "loss": 0.1296, + "step": 14112 + }, + { + "epoch": 0.225808, + "grad_norm": 1.0, + "learning_rate": 7.804516129032259e-05, + "loss": 0.1943, + "step": 14113 + }, + { + "epoch": 0.225824, + "grad_norm": 0.69921875, + "learning_rate": 7.804354838709677e-05, + "loss": 0.1701, + "step": 14114 + }, + { + "epoch": 0.22584, + "grad_norm": 0.59765625, + "learning_rate": 7.804193548387097e-05, + "loss": 0.1475, + "step": 14115 + }, + { + "epoch": 0.225856, + "grad_norm": 0.86328125, + "learning_rate": 7.804032258064517e-05, + "loss": 0.1934, + "step": 14116 + }, + { + "epoch": 0.225872, + "grad_norm": 1.171875, + "learning_rate": 7.803870967741937e-05, + "loss": 0.1786, + "step": 14117 + }, + { + "epoch": 0.225888, + "grad_norm": 0.5703125, + "learning_rate": 7.803709677419356e-05, + "loss": 0.1308, + "step": 14118 + }, + { + "epoch": 0.225904, + "grad_norm": 0.6328125, + "learning_rate": 7.803548387096774e-05, + "loss": 0.183, + "step": 14119 + }, + { + "epoch": 0.22592, + "grad_norm": 0.69140625, + "learning_rate": 7.803387096774194e-05, + "loss": 0.1636, + "step": 14120 + }, + { + "epoch": 0.225936, + "grad_norm": 0.58203125, + "learning_rate": 7.803225806451613e-05, + "loss": 0.1825, + "step": 14121 + }, + { + "epoch": 0.225952, + "grad_norm": 0.73046875, + "learning_rate": 7.803064516129033e-05, + "loss": 0.1632, + "step": 14122 + }, + { + "epoch": 0.225968, + "grad_norm": 0.78515625, + "learning_rate": 7.802903225806451e-05, + "loss": 0.1555, + "step": 14123 + }, + { + "epoch": 0.225984, + "grad_norm": 0.6953125, + "learning_rate": 7.802741935483871e-05, + "loss": 0.1841, + "step": 14124 + }, + { + "epoch": 0.226, + "grad_norm": 0.625, + "learning_rate": 7.80258064516129e-05, + "loss": 0.1921, + "step": 14125 + }, + { + "epoch": 0.226016, + "grad_norm": 0.765625, + "learning_rate": 7.80241935483871e-05, + "loss": 0.1827, + "step": 14126 + }, + { + "epoch": 0.226032, + "grad_norm": 0.76171875, + "learning_rate": 7.802258064516129e-05, + "loss": 0.1368, + "step": 14127 + }, + { + "epoch": 0.226048, + "grad_norm": 0.90234375, + "learning_rate": 7.802096774193548e-05, + "loss": 0.1815, + "step": 14128 + }, + { + "epoch": 0.226064, + "grad_norm": 0.90625, + "learning_rate": 7.801935483870968e-05, + "loss": 0.1768, + "step": 14129 + }, + { + "epoch": 0.22608, + "grad_norm": 1.875, + "learning_rate": 7.801774193548388e-05, + "loss": 0.1963, + "step": 14130 + }, + { + "epoch": 0.226096, + "grad_norm": 0.62109375, + "learning_rate": 7.801612903225807e-05, + "loss": 0.1925, + "step": 14131 + }, + { + "epoch": 0.226112, + "grad_norm": 1.078125, + "learning_rate": 7.801451612903227e-05, + "loss": 0.1846, + "step": 14132 + }, + { + "epoch": 0.226128, + "grad_norm": 1.0625, + "learning_rate": 7.801290322580646e-05, + "loss": 0.1677, + "step": 14133 + }, + { + "epoch": 0.226144, + "grad_norm": 0.72265625, + "learning_rate": 7.801129032258064e-05, + "loss": 0.1817, + "step": 14134 + }, + { + "epoch": 0.22616, + "grad_norm": 0.79296875, + "learning_rate": 7.800967741935484e-05, + "loss": 0.2052, + "step": 14135 + }, + { + "epoch": 0.226176, + "grad_norm": 0.56640625, + "learning_rate": 7.800806451612903e-05, + "loss": 0.1814, + "step": 14136 + }, + { + "epoch": 0.226192, + "grad_norm": 0.490234375, + "learning_rate": 7.800645161290323e-05, + "loss": 0.1489, + "step": 14137 + }, + { + "epoch": 0.226208, + "grad_norm": 0.734375, + "learning_rate": 7.800483870967741e-05, + "loss": 0.1589, + "step": 14138 + }, + { + "epoch": 0.226224, + "grad_norm": 0.65234375, + "learning_rate": 7.800322580645161e-05, + "loss": 0.14, + "step": 14139 + }, + { + "epoch": 0.22624, + "grad_norm": 1.015625, + "learning_rate": 7.800161290322581e-05, + "loss": 0.1954, + "step": 14140 + }, + { + "epoch": 0.226256, + "grad_norm": 1.3984375, + "learning_rate": 7.800000000000001e-05, + "loss": 0.2287, + "step": 14141 + }, + { + "epoch": 0.226272, + "grad_norm": 0.8046875, + "learning_rate": 7.79983870967742e-05, + "loss": 0.2037, + "step": 14142 + }, + { + "epoch": 0.226288, + "grad_norm": 1.0625, + "learning_rate": 7.79967741935484e-05, + "loss": 0.196, + "step": 14143 + }, + { + "epoch": 0.226304, + "grad_norm": 1.015625, + "learning_rate": 7.799516129032258e-05, + "loss": 0.1334, + "step": 14144 + }, + { + "epoch": 0.22632, + "grad_norm": 1.015625, + "learning_rate": 7.799354838709678e-05, + "loss": 0.1501, + "step": 14145 + }, + { + "epoch": 0.226336, + "grad_norm": 0.828125, + "learning_rate": 7.799193548387097e-05, + "loss": 0.1438, + "step": 14146 + }, + { + "epoch": 0.226352, + "grad_norm": 0.88671875, + "learning_rate": 7.799032258064517e-05, + "loss": 0.1639, + "step": 14147 + }, + { + "epoch": 0.226368, + "grad_norm": 0.62890625, + "learning_rate": 7.798870967741936e-05, + "loss": 0.1672, + "step": 14148 + }, + { + "epoch": 0.226384, + "grad_norm": 0.478515625, + "learning_rate": 7.798709677419355e-05, + "loss": 0.1393, + "step": 14149 + }, + { + "epoch": 0.2264, + "grad_norm": 0.87890625, + "learning_rate": 7.798548387096774e-05, + "loss": 0.2443, + "step": 14150 + }, + { + "epoch": 0.226416, + "grad_norm": 1.0703125, + "learning_rate": 7.798387096774194e-05, + "loss": 0.1792, + "step": 14151 + }, + { + "epoch": 0.226432, + "grad_norm": 0.90625, + "learning_rate": 7.798225806451614e-05, + "loss": 0.167, + "step": 14152 + }, + { + "epoch": 0.226448, + "grad_norm": 1.3125, + "learning_rate": 7.798064516129033e-05, + "loss": 0.1637, + "step": 14153 + }, + { + "epoch": 0.226464, + "grad_norm": 0.7265625, + "learning_rate": 7.797903225806453e-05, + "loss": 0.1558, + "step": 14154 + }, + { + "epoch": 0.22648, + "grad_norm": 0.91796875, + "learning_rate": 7.797741935483871e-05, + "loss": 0.185, + "step": 14155 + }, + { + "epoch": 0.226496, + "grad_norm": 0.93359375, + "learning_rate": 7.797580645161291e-05, + "loss": 0.1943, + "step": 14156 + }, + { + "epoch": 0.226512, + "grad_norm": 0.7265625, + "learning_rate": 7.79741935483871e-05, + "loss": 0.1838, + "step": 14157 + }, + { + "epoch": 0.226528, + "grad_norm": 0.96875, + "learning_rate": 7.79725806451613e-05, + "loss": 0.2042, + "step": 14158 + }, + { + "epoch": 0.226544, + "grad_norm": 1.03125, + "learning_rate": 7.797096774193548e-05, + "loss": 0.1801, + "step": 14159 + }, + { + "epoch": 0.22656, + "grad_norm": 1.0703125, + "learning_rate": 7.796935483870968e-05, + "loss": 0.1559, + "step": 14160 + }, + { + "epoch": 0.226576, + "grad_norm": 0.47265625, + "learning_rate": 7.796774193548387e-05, + "loss": 0.1794, + "step": 14161 + }, + { + "epoch": 0.226592, + "grad_norm": 1.4375, + "learning_rate": 7.796612903225807e-05, + "loss": 0.2072, + "step": 14162 + }, + { + "epoch": 0.226608, + "grad_norm": 1.2890625, + "learning_rate": 7.796451612903225e-05, + "loss": 0.1433, + "step": 14163 + }, + { + "epoch": 0.226624, + "grad_norm": 1.21875, + "learning_rate": 7.796290322580645e-05, + "loss": 0.1675, + "step": 14164 + }, + { + "epoch": 0.22664, + "grad_norm": 1.71875, + "learning_rate": 7.796129032258065e-05, + "loss": 0.1679, + "step": 14165 + }, + { + "epoch": 0.226656, + "grad_norm": 1.6171875, + "learning_rate": 7.795967741935484e-05, + "loss": 0.167, + "step": 14166 + }, + { + "epoch": 0.226672, + "grad_norm": 0.921875, + "learning_rate": 7.795806451612904e-05, + "loss": 0.2036, + "step": 14167 + }, + { + "epoch": 0.226688, + "grad_norm": 0.63671875, + "learning_rate": 7.795645161290323e-05, + "loss": 0.177, + "step": 14168 + }, + { + "epoch": 0.226704, + "grad_norm": 0.96484375, + "learning_rate": 7.795483870967743e-05, + "loss": 0.1797, + "step": 14169 + }, + { + "epoch": 0.22672, + "grad_norm": 0.6640625, + "learning_rate": 7.795322580645161e-05, + "loss": 0.1847, + "step": 14170 + }, + { + "epoch": 0.226736, + "grad_norm": 0.76953125, + "learning_rate": 7.795161290322581e-05, + "loss": 0.196, + "step": 14171 + }, + { + "epoch": 0.226752, + "grad_norm": 0.890625, + "learning_rate": 7.795e-05, + "loss": 0.134, + "step": 14172 + }, + { + "epoch": 0.226768, + "grad_norm": 1.0703125, + "learning_rate": 7.79483870967742e-05, + "loss": 0.1554, + "step": 14173 + }, + { + "epoch": 0.226784, + "grad_norm": 0.54296875, + "learning_rate": 7.794677419354838e-05, + "loss": 0.1927, + "step": 14174 + }, + { + "epoch": 0.2268, + "grad_norm": 0.6328125, + "learning_rate": 7.794516129032258e-05, + "loss": 0.1272, + "step": 14175 + }, + { + "epoch": 0.226816, + "grad_norm": 0.72265625, + "learning_rate": 7.794354838709678e-05, + "loss": 0.1385, + "step": 14176 + }, + { + "epoch": 0.226832, + "grad_norm": 0.67578125, + "learning_rate": 7.794193548387098e-05, + "loss": 0.1995, + "step": 14177 + }, + { + "epoch": 0.226848, + "grad_norm": 1.4609375, + "learning_rate": 7.794032258064517e-05, + "loss": 0.1801, + "step": 14178 + }, + { + "epoch": 0.226864, + "grad_norm": 0.58203125, + "learning_rate": 7.793870967741937e-05, + "loss": 0.1713, + "step": 14179 + }, + { + "epoch": 0.22688, + "grad_norm": 0.703125, + "learning_rate": 7.793709677419355e-05, + "loss": 0.165, + "step": 14180 + }, + { + "epoch": 0.226896, + "grad_norm": 0.7109375, + "learning_rate": 7.793548387096774e-05, + "loss": 0.1959, + "step": 14181 + }, + { + "epoch": 0.226912, + "grad_norm": 0.486328125, + "learning_rate": 7.793387096774194e-05, + "loss": 0.1562, + "step": 14182 + }, + { + "epoch": 0.226928, + "grad_norm": 0.6796875, + "learning_rate": 7.793225806451613e-05, + "loss": 0.147, + "step": 14183 + }, + { + "epoch": 0.226944, + "grad_norm": 0.6875, + "learning_rate": 7.793064516129033e-05, + "loss": 0.1478, + "step": 14184 + }, + { + "epoch": 0.22696, + "grad_norm": 0.66796875, + "learning_rate": 7.792903225806451e-05, + "loss": 0.1662, + "step": 14185 + }, + { + "epoch": 0.226976, + "grad_norm": 1.21875, + "learning_rate": 7.792741935483871e-05, + "loss": 0.1653, + "step": 14186 + }, + { + "epoch": 0.226992, + "grad_norm": 1.453125, + "learning_rate": 7.792580645161291e-05, + "loss": 0.212, + "step": 14187 + }, + { + "epoch": 0.227008, + "grad_norm": 0.97265625, + "learning_rate": 7.79241935483871e-05, + "loss": 0.1748, + "step": 14188 + }, + { + "epoch": 0.227024, + "grad_norm": 1.0546875, + "learning_rate": 7.79225806451613e-05, + "loss": 0.1598, + "step": 14189 + }, + { + "epoch": 0.22704, + "grad_norm": 0.890625, + "learning_rate": 7.79209677419355e-05, + "loss": 0.2103, + "step": 14190 + }, + { + "epoch": 0.227056, + "grad_norm": 0.67578125, + "learning_rate": 7.791935483870968e-05, + "loss": 0.1944, + "step": 14191 + }, + { + "epoch": 0.227072, + "grad_norm": 0.77734375, + "learning_rate": 7.791774193548388e-05, + "loss": 0.1461, + "step": 14192 + }, + { + "epoch": 0.227088, + "grad_norm": 0.72265625, + "learning_rate": 7.791612903225807e-05, + "loss": 0.1801, + "step": 14193 + }, + { + "epoch": 0.227104, + "grad_norm": 0.6875, + "learning_rate": 7.791451612903227e-05, + "loss": 0.178, + "step": 14194 + }, + { + "epoch": 0.22712, + "grad_norm": 0.54296875, + "learning_rate": 7.791290322580645e-05, + "loss": 0.167, + "step": 14195 + }, + { + "epoch": 0.227136, + "grad_norm": 0.90234375, + "learning_rate": 7.791129032258064e-05, + "loss": 0.1788, + "step": 14196 + }, + { + "epoch": 0.227152, + "grad_norm": 1.4375, + "learning_rate": 7.790967741935484e-05, + "loss": 0.1695, + "step": 14197 + }, + { + "epoch": 0.227168, + "grad_norm": 0.86328125, + "learning_rate": 7.790806451612903e-05, + "loss": 0.1925, + "step": 14198 + }, + { + "epoch": 0.227184, + "grad_norm": 0.94140625, + "learning_rate": 7.790645161290322e-05, + "loss": 0.1696, + "step": 14199 + }, + { + "epoch": 0.2272, + "grad_norm": 0.98828125, + "learning_rate": 7.790483870967742e-05, + "loss": 0.1828, + "step": 14200 + }, + { + "epoch": 0.227216, + "grad_norm": 1.1484375, + "learning_rate": 7.790322580645162e-05, + "loss": 0.1934, + "step": 14201 + }, + { + "epoch": 0.227232, + "grad_norm": 0.94140625, + "learning_rate": 7.790161290322581e-05, + "loss": 0.2074, + "step": 14202 + }, + { + "epoch": 0.227248, + "grad_norm": 0.6015625, + "learning_rate": 7.790000000000001e-05, + "loss": 0.1638, + "step": 14203 + }, + { + "epoch": 0.227264, + "grad_norm": 0.96875, + "learning_rate": 7.78983870967742e-05, + "loss": 0.1548, + "step": 14204 + }, + { + "epoch": 0.22728, + "grad_norm": 0.57421875, + "learning_rate": 7.78967741935484e-05, + "loss": 0.1819, + "step": 14205 + }, + { + "epoch": 0.227296, + "grad_norm": 0.62109375, + "learning_rate": 7.789516129032258e-05, + "loss": 0.1784, + "step": 14206 + }, + { + "epoch": 0.227312, + "grad_norm": 0.73046875, + "learning_rate": 7.789354838709678e-05, + "loss": 0.1881, + "step": 14207 + }, + { + "epoch": 0.227328, + "grad_norm": 0.83203125, + "learning_rate": 7.789193548387097e-05, + "loss": 0.2096, + "step": 14208 + }, + { + "epoch": 0.227344, + "grad_norm": 0.515625, + "learning_rate": 7.789032258064517e-05, + "loss": 0.1583, + "step": 14209 + }, + { + "epoch": 0.22736, + "grad_norm": 0.734375, + "learning_rate": 7.788870967741935e-05, + "loss": 0.1227, + "step": 14210 + }, + { + "epoch": 0.227376, + "grad_norm": 0.73828125, + "learning_rate": 7.788709677419355e-05, + "loss": 0.158, + "step": 14211 + }, + { + "epoch": 0.227392, + "grad_norm": 0.62890625, + "learning_rate": 7.788548387096775e-05, + "loss": 0.177, + "step": 14212 + }, + { + "epoch": 0.227408, + "grad_norm": 1.046875, + "learning_rate": 7.788387096774194e-05, + "loss": 0.1608, + "step": 14213 + }, + { + "epoch": 0.227424, + "grad_norm": 0.82421875, + "learning_rate": 7.788225806451614e-05, + "loss": 0.1939, + "step": 14214 + }, + { + "epoch": 0.22744, + "grad_norm": 0.466796875, + "learning_rate": 7.788064516129032e-05, + "loss": 0.1206, + "step": 14215 + }, + { + "epoch": 0.227456, + "grad_norm": 1.109375, + "learning_rate": 7.787903225806452e-05, + "loss": 0.1507, + "step": 14216 + }, + { + "epoch": 0.227472, + "grad_norm": 0.63671875, + "learning_rate": 7.787741935483871e-05, + "loss": 0.1401, + "step": 14217 + }, + { + "epoch": 0.227488, + "grad_norm": 0.62109375, + "learning_rate": 7.787580645161291e-05, + "loss": 0.1943, + "step": 14218 + }, + { + "epoch": 0.227504, + "grad_norm": 1.015625, + "learning_rate": 7.78741935483871e-05, + "loss": 0.2018, + "step": 14219 + }, + { + "epoch": 0.22752, + "grad_norm": 0.71484375, + "learning_rate": 7.78725806451613e-05, + "loss": 0.1585, + "step": 14220 + }, + { + "epoch": 0.227536, + "grad_norm": 0.8828125, + "learning_rate": 7.787096774193548e-05, + "loss": 0.162, + "step": 14221 + }, + { + "epoch": 0.227552, + "grad_norm": 0.59765625, + "learning_rate": 7.786935483870968e-05, + "loss": 0.1648, + "step": 14222 + }, + { + "epoch": 0.227568, + "grad_norm": 0.59375, + "learning_rate": 7.786774193548387e-05, + "loss": 0.1985, + "step": 14223 + }, + { + "epoch": 0.227584, + "grad_norm": 0.546875, + "learning_rate": 7.786612903225807e-05, + "loss": 0.1331, + "step": 14224 + }, + { + "epoch": 0.2276, + "grad_norm": 0.86328125, + "learning_rate": 7.786451612903227e-05, + "loss": 0.1903, + "step": 14225 + }, + { + "epoch": 0.227616, + "grad_norm": 0.66796875, + "learning_rate": 7.786290322580647e-05, + "loss": 0.13, + "step": 14226 + }, + { + "epoch": 0.227632, + "grad_norm": 1.140625, + "learning_rate": 7.786129032258065e-05, + "loss": 0.1961, + "step": 14227 + }, + { + "epoch": 0.227648, + "grad_norm": 0.69921875, + "learning_rate": 7.785967741935484e-05, + "loss": 0.168, + "step": 14228 + }, + { + "epoch": 0.227664, + "grad_norm": 0.5625, + "learning_rate": 7.785806451612904e-05, + "loss": 0.1508, + "step": 14229 + }, + { + "epoch": 0.22768, + "grad_norm": 0.66015625, + "learning_rate": 7.785645161290322e-05, + "loss": 0.1943, + "step": 14230 + }, + { + "epoch": 0.227696, + "grad_norm": 0.578125, + "learning_rate": 7.785483870967742e-05, + "loss": 0.1109, + "step": 14231 + }, + { + "epoch": 0.227712, + "grad_norm": 1.5234375, + "learning_rate": 7.785322580645161e-05, + "loss": 0.1904, + "step": 14232 + }, + { + "epoch": 0.227728, + "grad_norm": 1.34375, + "learning_rate": 7.785161290322581e-05, + "loss": 0.1691, + "step": 14233 + }, + { + "epoch": 0.227744, + "grad_norm": 0.70703125, + "learning_rate": 7.785e-05, + "loss": 0.1588, + "step": 14234 + }, + { + "epoch": 0.22776, + "grad_norm": 0.6484375, + "learning_rate": 7.78483870967742e-05, + "loss": 0.1909, + "step": 14235 + }, + { + "epoch": 0.227776, + "grad_norm": 0.6953125, + "learning_rate": 7.78467741935484e-05, + "loss": 0.1666, + "step": 14236 + }, + { + "epoch": 0.227792, + "grad_norm": 0.79296875, + "learning_rate": 7.78451612903226e-05, + "loss": 0.1641, + "step": 14237 + }, + { + "epoch": 0.227808, + "grad_norm": 0.578125, + "learning_rate": 7.784354838709678e-05, + "loss": 0.177, + "step": 14238 + }, + { + "epoch": 0.227824, + "grad_norm": 0.7578125, + "learning_rate": 7.784193548387098e-05, + "loss": 0.1651, + "step": 14239 + }, + { + "epoch": 0.22784, + "grad_norm": 0.7578125, + "learning_rate": 7.784032258064517e-05, + "loss": 0.1815, + "step": 14240 + }, + { + "epoch": 0.227856, + "grad_norm": 0.80859375, + "learning_rate": 7.783870967741937e-05, + "loss": 0.2222, + "step": 14241 + }, + { + "epoch": 0.227872, + "grad_norm": 0.52734375, + "learning_rate": 7.783709677419355e-05, + "loss": 0.1632, + "step": 14242 + }, + { + "epoch": 0.227888, + "grad_norm": 1.0703125, + "learning_rate": 7.783548387096774e-05, + "loss": 0.1863, + "step": 14243 + }, + { + "epoch": 0.227904, + "grad_norm": 1.1796875, + "learning_rate": 7.783387096774194e-05, + "loss": 0.1665, + "step": 14244 + }, + { + "epoch": 0.22792, + "grad_norm": 0.84375, + "learning_rate": 7.783225806451612e-05, + "loss": 0.1739, + "step": 14245 + }, + { + "epoch": 0.227936, + "grad_norm": 0.99609375, + "learning_rate": 7.783064516129032e-05, + "loss": 0.1946, + "step": 14246 + }, + { + "epoch": 0.227952, + "grad_norm": 0.7578125, + "learning_rate": 7.782903225806452e-05, + "loss": 0.1457, + "step": 14247 + }, + { + "epoch": 0.227968, + "grad_norm": 0.5390625, + "learning_rate": 7.782741935483872e-05, + "loss": 0.1215, + "step": 14248 + }, + { + "epoch": 0.227984, + "grad_norm": 0.828125, + "learning_rate": 7.782580645161291e-05, + "loss": 0.2078, + "step": 14249 + }, + { + "epoch": 0.228, + "grad_norm": 0.640625, + "learning_rate": 7.782419354838711e-05, + "loss": 0.1617, + "step": 14250 + }, + { + "epoch": 0.228016, + "grad_norm": 0.5703125, + "learning_rate": 7.78225806451613e-05, + "loss": 0.1545, + "step": 14251 + }, + { + "epoch": 0.228032, + "grad_norm": 0.63671875, + "learning_rate": 7.78209677419355e-05, + "loss": 0.1953, + "step": 14252 + }, + { + "epoch": 0.228048, + "grad_norm": 1.6796875, + "learning_rate": 7.781935483870968e-05, + "loss": 0.218, + "step": 14253 + }, + { + "epoch": 0.228064, + "grad_norm": 0.984375, + "learning_rate": 7.781774193548388e-05, + "loss": 0.1332, + "step": 14254 + }, + { + "epoch": 0.22808, + "grad_norm": 1.21875, + "learning_rate": 7.781612903225807e-05, + "loss": 0.1861, + "step": 14255 + }, + { + "epoch": 0.228096, + "grad_norm": 0.60546875, + "learning_rate": 7.781451612903226e-05, + "loss": 0.1538, + "step": 14256 + }, + { + "epoch": 0.228112, + "grad_norm": 1.375, + "learning_rate": 7.781290322580645e-05, + "loss": 0.2155, + "step": 14257 + }, + { + "epoch": 0.228128, + "grad_norm": 0.69921875, + "learning_rate": 7.781129032258065e-05, + "loss": 0.1899, + "step": 14258 + }, + { + "epoch": 0.228144, + "grad_norm": 0.95703125, + "learning_rate": 7.780967741935484e-05, + "loss": 0.1959, + "step": 14259 + }, + { + "epoch": 0.22816, + "grad_norm": 0.765625, + "learning_rate": 7.780806451612904e-05, + "loss": 0.2133, + "step": 14260 + }, + { + "epoch": 0.228176, + "grad_norm": 0.82421875, + "learning_rate": 7.780645161290324e-05, + "loss": 0.1557, + "step": 14261 + }, + { + "epoch": 0.228192, + "grad_norm": 1.078125, + "learning_rate": 7.780483870967742e-05, + "loss": 0.1264, + "step": 14262 + }, + { + "epoch": 0.228208, + "grad_norm": 0.83984375, + "learning_rate": 7.780322580645162e-05, + "loss": 0.1738, + "step": 14263 + }, + { + "epoch": 0.228224, + "grad_norm": 1.0625, + "learning_rate": 7.780161290322581e-05, + "loss": 0.15, + "step": 14264 + }, + { + "epoch": 0.22824, + "grad_norm": 0.94921875, + "learning_rate": 7.780000000000001e-05, + "loss": 0.184, + "step": 14265 + }, + { + "epoch": 0.228256, + "grad_norm": 0.91796875, + "learning_rate": 7.77983870967742e-05, + "loss": 0.1602, + "step": 14266 + }, + { + "epoch": 0.228272, + "grad_norm": 0.87890625, + "learning_rate": 7.779677419354839e-05, + "loss": 0.179, + "step": 14267 + }, + { + "epoch": 0.228288, + "grad_norm": 0.7578125, + "learning_rate": 7.779516129032258e-05, + "loss": 0.2058, + "step": 14268 + }, + { + "epoch": 0.228304, + "grad_norm": 1.0, + "learning_rate": 7.779354838709678e-05, + "loss": 0.2038, + "step": 14269 + }, + { + "epoch": 0.22832, + "grad_norm": 0.84765625, + "learning_rate": 7.779193548387096e-05, + "loss": 0.1962, + "step": 14270 + }, + { + "epoch": 0.228336, + "grad_norm": 1.140625, + "learning_rate": 7.779032258064516e-05, + "loss": 0.1421, + "step": 14271 + }, + { + "epoch": 0.228352, + "grad_norm": 0.8828125, + "learning_rate": 7.778870967741936e-05, + "loss": 0.168, + "step": 14272 + }, + { + "epoch": 0.228368, + "grad_norm": 0.58203125, + "learning_rate": 7.778709677419356e-05, + "loss": 0.198, + "step": 14273 + }, + { + "epoch": 0.228384, + "grad_norm": 1.2109375, + "learning_rate": 7.778548387096775e-05, + "loss": 0.1581, + "step": 14274 + }, + { + "epoch": 0.2284, + "grad_norm": 0.58984375, + "learning_rate": 7.778387096774194e-05, + "loss": 0.1818, + "step": 14275 + }, + { + "epoch": 0.228416, + "grad_norm": 0.64453125, + "learning_rate": 7.778225806451614e-05, + "loss": 0.1402, + "step": 14276 + }, + { + "epoch": 0.228432, + "grad_norm": 1.171875, + "learning_rate": 7.778064516129032e-05, + "loss": 0.1908, + "step": 14277 + }, + { + "epoch": 0.228448, + "grad_norm": 0.58203125, + "learning_rate": 7.777903225806452e-05, + "loss": 0.1599, + "step": 14278 + }, + { + "epoch": 0.228464, + "grad_norm": 0.9609375, + "learning_rate": 7.777741935483871e-05, + "loss": 0.1675, + "step": 14279 + }, + { + "epoch": 0.22848, + "grad_norm": 0.65234375, + "learning_rate": 7.777580645161291e-05, + "loss": 0.1694, + "step": 14280 + }, + { + "epoch": 0.228496, + "grad_norm": 0.72265625, + "learning_rate": 7.777419354838709e-05, + "loss": 0.1571, + "step": 14281 + }, + { + "epoch": 0.228512, + "grad_norm": 0.6484375, + "learning_rate": 7.777258064516129e-05, + "loss": 0.1476, + "step": 14282 + }, + { + "epoch": 0.228528, + "grad_norm": 1.703125, + "learning_rate": 7.777096774193548e-05, + "loss": 0.1911, + "step": 14283 + }, + { + "epoch": 0.228544, + "grad_norm": 0.74609375, + "learning_rate": 7.776935483870968e-05, + "loss": 0.1792, + "step": 14284 + }, + { + "epoch": 0.22856, + "grad_norm": 0.92578125, + "learning_rate": 7.776774193548388e-05, + "loss": 0.1673, + "step": 14285 + }, + { + "epoch": 0.228576, + "grad_norm": 1.296875, + "learning_rate": 7.776612903225808e-05, + "loss": 0.1706, + "step": 14286 + }, + { + "epoch": 0.228592, + "grad_norm": 0.6796875, + "learning_rate": 7.776451612903226e-05, + "loss": 0.1514, + "step": 14287 + }, + { + "epoch": 0.228608, + "grad_norm": 0.796875, + "learning_rate": 7.776290322580646e-05, + "loss": 0.166, + "step": 14288 + }, + { + "epoch": 0.228624, + "grad_norm": 0.5546875, + "learning_rate": 7.776129032258065e-05, + "loss": 0.1581, + "step": 14289 + }, + { + "epoch": 0.22864, + "grad_norm": 0.46875, + "learning_rate": 7.775967741935484e-05, + "loss": 0.1272, + "step": 14290 + }, + { + "epoch": 0.228656, + "grad_norm": 1.0, + "learning_rate": 7.775806451612904e-05, + "loss": 0.2007, + "step": 14291 + }, + { + "epoch": 0.228672, + "grad_norm": 0.86328125, + "learning_rate": 7.775645161290322e-05, + "loss": 0.1894, + "step": 14292 + }, + { + "epoch": 0.228688, + "grad_norm": 1.0390625, + "learning_rate": 7.775483870967742e-05, + "loss": 0.2174, + "step": 14293 + }, + { + "epoch": 0.228704, + "grad_norm": 0.9296875, + "learning_rate": 7.775322580645161e-05, + "loss": 0.1782, + "step": 14294 + }, + { + "epoch": 0.22872, + "grad_norm": 1.078125, + "learning_rate": 7.77516129032258e-05, + "loss": 0.1879, + "step": 14295 + }, + { + "epoch": 0.228736, + "grad_norm": 0.7734375, + "learning_rate": 7.775e-05, + "loss": 0.2214, + "step": 14296 + }, + { + "epoch": 0.228752, + "grad_norm": 0.6015625, + "learning_rate": 7.77483870967742e-05, + "loss": 0.1539, + "step": 14297 + }, + { + "epoch": 0.228768, + "grad_norm": 0.7421875, + "learning_rate": 7.774677419354839e-05, + "loss": 0.1923, + "step": 14298 + }, + { + "epoch": 0.228784, + "grad_norm": 1.0234375, + "learning_rate": 7.774516129032259e-05, + "loss": 0.1761, + "step": 14299 + }, + { + "epoch": 0.2288, + "grad_norm": 0.8125, + "learning_rate": 7.774354838709678e-05, + "loss": 0.1868, + "step": 14300 + }, + { + "epoch": 0.228816, + "grad_norm": 1.2578125, + "learning_rate": 7.774193548387098e-05, + "loss": 0.1763, + "step": 14301 + }, + { + "epoch": 0.228832, + "grad_norm": 0.578125, + "learning_rate": 7.774032258064516e-05, + "loss": 0.2278, + "step": 14302 + }, + { + "epoch": 0.228848, + "grad_norm": 1.0390625, + "learning_rate": 7.773870967741936e-05, + "loss": 0.1853, + "step": 14303 + }, + { + "epoch": 0.228864, + "grad_norm": 0.65625, + "learning_rate": 7.773709677419355e-05, + "loss": 0.1989, + "step": 14304 + }, + { + "epoch": 0.22888, + "grad_norm": 1.1015625, + "learning_rate": 7.773548387096773e-05, + "loss": 0.1609, + "step": 14305 + }, + { + "epoch": 0.228896, + "grad_norm": 0.92578125, + "learning_rate": 7.773387096774193e-05, + "loss": 0.1663, + "step": 14306 + }, + { + "epoch": 0.228912, + "grad_norm": 1.0390625, + "learning_rate": 7.773225806451613e-05, + "loss": 0.1975, + "step": 14307 + }, + { + "epoch": 0.228928, + "grad_norm": 0.7109375, + "learning_rate": 7.773064516129033e-05, + "loss": 0.1622, + "step": 14308 + }, + { + "epoch": 0.228944, + "grad_norm": 0.8828125, + "learning_rate": 7.772903225806452e-05, + "loss": 0.1735, + "step": 14309 + }, + { + "epoch": 0.22896, + "grad_norm": 0.546875, + "learning_rate": 7.772741935483872e-05, + "loss": 0.1774, + "step": 14310 + }, + { + "epoch": 0.228976, + "grad_norm": 1.078125, + "learning_rate": 7.77258064516129e-05, + "loss": 0.1784, + "step": 14311 + }, + { + "epoch": 0.228992, + "grad_norm": 1.21875, + "learning_rate": 7.77241935483871e-05, + "loss": 0.1961, + "step": 14312 + }, + { + "epoch": 0.229008, + "grad_norm": 0.6640625, + "learning_rate": 7.772258064516129e-05, + "loss": 0.1728, + "step": 14313 + }, + { + "epoch": 0.229024, + "grad_norm": 0.640625, + "learning_rate": 7.772096774193549e-05, + "loss": 0.1714, + "step": 14314 + }, + { + "epoch": 0.22904, + "grad_norm": 1.0625, + "learning_rate": 7.771935483870968e-05, + "loss": 0.1673, + "step": 14315 + }, + { + "epoch": 0.229056, + "grad_norm": 0.9765625, + "learning_rate": 7.771774193548388e-05, + "loss": 0.213, + "step": 14316 + }, + { + "epoch": 0.229072, + "grad_norm": 0.92578125, + "learning_rate": 7.771612903225806e-05, + "loss": 0.1285, + "step": 14317 + }, + { + "epoch": 0.229088, + "grad_norm": 0.67578125, + "learning_rate": 7.771451612903226e-05, + "loss": 0.1843, + "step": 14318 + }, + { + "epoch": 0.229104, + "grad_norm": 0.98046875, + "learning_rate": 7.771290322580645e-05, + "loss": 0.1552, + "step": 14319 + }, + { + "epoch": 0.22912, + "grad_norm": 0.87109375, + "learning_rate": 7.771129032258065e-05, + "loss": 0.1553, + "step": 14320 + }, + { + "epoch": 0.229136, + "grad_norm": 0.79296875, + "learning_rate": 7.770967741935485e-05, + "loss": 0.2047, + "step": 14321 + }, + { + "epoch": 0.229152, + "grad_norm": 0.77734375, + "learning_rate": 7.770806451612903e-05, + "loss": 0.1507, + "step": 14322 + }, + { + "epoch": 0.229168, + "grad_norm": 0.72265625, + "learning_rate": 7.770645161290323e-05, + "loss": 0.1504, + "step": 14323 + }, + { + "epoch": 0.229184, + "grad_norm": 0.765625, + "learning_rate": 7.770483870967742e-05, + "loss": 0.1374, + "step": 14324 + }, + { + "epoch": 0.2292, + "grad_norm": 0.87109375, + "learning_rate": 7.770322580645162e-05, + "loss": 0.15, + "step": 14325 + }, + { + "epoch": 0.229216, + "grad_norm": 0.765625, + "learning_rate": 7.77016129032258e-05, + "loss": 0.1563, + "step": 14326 + }, + { + "epoch": 0.229232, + "grad_norm": 0.69140625, + "learning_rate": 7.77e-05, + "loss": 0.1755, + "step": 14327 + }, + { + "epoch": 0.229248, + "grad_norm": 0.65625, + "learning_rate": 7.769838709677419e-05, + "loss": 0.1813, + "step": 14328 + }, + { + "epoch": 0.229264, + "grad_norm": 0.515625, + "learning_rate": 7.769677419354839e-05, + "loss": 0.1657, + "step": 14329 + }, + { + "epoch": 0.22928, + "grad_norm": 0.859375, + "learning_rate": 7.769516129032258e-05, + "loss": 0.1289, + "step": 14330 + }, + { + "epoch": 0.229296, + "grad_norm": 0.625, + "learning_rate": 7.769354838709678e-05, + "loss": 0.1549, + "step": 14331 + }, + { + "epoch": 0.229312, + "grad_norm": 1.0703125, + "learning_rate": 7.769193548387098e-05, + "loss": 0.1958, + "step": 14332 + }, + { + "epoch": 0.229328, + "grad_norm": 1.0625, + "learning_rate": 7.769032258064518e-05, + "loss": 0.1826, + "step": 14333 + }, + { + "epoch": 0.229344, + "grad_norm": 0.6328125, + "learning_rate": 7.768870967741936e-05, + "loss": 0.186, + "step": 14334 + }, + { + "epoch": 0.22936, + "grad_norm": 0.7734375, + "learning_rate": 7.768709677419356e-05, + "loss": 0.1738, + "step": 14335 + }, + { + "epoch": 0.229376, + "grad_norm": 0.75390625, + "learning_rate": 7.768548387096775e-05, + "loss": 0.1638, + "step": 14336 + }, + { + "epoch": 0.229392, + "grad_norm": 0.83203125, + "learning_rate": 7.768387096774193e-05, + "loss": 0.1375, + "step": 14337 + }, + { + "epoch": 0.229408, + "grad_norm": 1.15625, + "learning_rate": 7.768225806451613e-05, + "loss": 0.1776, + "step": 14338 + }, + { + "epoch": 0.229424, + "grad_norm": 0.63671875, + "learning_rate": 7.768064516129032e-05, + "loss": 0.1605, + "step": 14339 + }, + { + "epoch": 0.22944, + "grad_norm": 0.5703125, + "learning_rate": 7.767903225806452e-05, + "loss": 0.1627, + "step": 14340 + }, + { + "epoch": 0.229456, + "grad_norm": 0.83984375, + "learning_rate": 7.76774193548387e-05, + "loss": 0.1528, + "step": 14341 + }, + { + "epoch": 0.229472, + "grad_norm": 0.84375, + "learning_rate": 7.76758064516129e-05, + "loss": 0.2002, + "step": 14342 + }, + { + "epoch": 0.229488, + "grad_norm": 0.80859375, + "learning_rate": 7.76741935483871e-05, + "loss": 0.1832, + "step": 14343 + }, + { + "epoch": 0.229504, + "grad_norm": 0.69921875, + "learning_rate": 7.76725806451613e-05, + "loss": 0.1954, + "step": 14344 + }, + { + "epoch": 0.22952, + "grad_norm": 0.6328125, + "learning_rate": 7.767096774193549e-05, + "loss": 0.1299, + "step": 14345 + }, + { + "epoch": 0.229536, + "grad_norm": 1.0078125, + "learning_rate": 7.766935483870969e-05, + "loss": 0.1758, + "step": 14346 + }, + { + "epoch": 0.229552, + "grad_norm": 0.578125, + "learning_rate": 7.766774193548388e-05, + "loss": 0.1336, + "step": 14347 + }, + { + "epoch": 0.229568, + "grad_norm": 1.2890625, + "learning_rate": 7.766612903225808e-05, + "loss": 0.1787, + "step": 14348 + }, + { + "epoch": 0.229584, + "grad_norm": 0.77734375, + "learning_rate": 7.766451612903226e-05, + "loss": 0.1977, + "step": 14349 + }, + { + "epoch": 0.2296, + "grad_norm": 0.90234375, + "learning_rate": 7.766290322580646e-05, + "loss": 0.1566, + "step": 14350 + }, + { + "epoch": 0.229616, + "grad_norm": 0.59375, + "learning_rate": 7.766129032258065e-05, + "loss": 0.1367, + "step": 14351 + }, + { + "epoch": 0.229632, + "grad_norm": 1.3359375, + "learning_rate": 7.765967741935483e-05, + "loss": 0.1629, + "step": 14352 + }, + { + "epoch": 0.229648, + "grad_norm": 0.70703125, + "learning_rate": 7.765806451612903e-05, + "loss": 0.1736, + "step": 14353 + }, + { + "epoch": 0.229664, + "grad_norm": 0.95703125, + "learning_rate": 7.765645161290322e-05, + "loss": 0.1745, + "step": 14354 + }, + { + "epoch": 0.22968, + "grad_norm": 0.94921875, + "learning_rate": 7.765483870967742e-05, + "loss": 0.2298, + "step": 14355 + }, + { + "epoch": 0.229696, + "grad_norm": 0.6796875, + "learning_rate": 7.765322580645162e-05, + "loss": 0.1674, + "step": 14356 + }, + { + "epoch": 0.229712, + "grad_norm": 0.5546875, + "learning_rate": 7.765161290322582e-05, + "loss": 0.1637, + "step": 14357 + }, + { + "epoch": 0.229728, + "grad_norm": 0.703125, + "learning_rate": 7.765e-05, + "loss": 0.1667, + "step": 14358 + }, + { + "epoch": 0.229744, + "grad_norm": 0.6796875, + "learning_rate": 7.76483870967742e-05, + "loss": 0.1639, + "step": 14359 + }, + { + "epoch": 0.22976, + "grad_norm": 1.1953125, + "learning_rate": 7.764677419354839e-05, + "loss": 0.1573, + "step": 14360 + }, + { + "epoch": 0.229776, + "grad_norm": 1.234375, + "learning_rate": 7.764516129032259e-05, + "loss": 0.1629, + "step": 14361 + }, + { + "epoch": 0.229792, + "grad_norm": 0.5234375, + "learning_rate": 7.764354838709678e-05, + "loss": 0.1499, + "step": 14362 + }, + { + "epoch": 0.229808, + "grad_norm": 1.1015625, + "learning_rate": 7.764193548387097e-05, + "loss": 0.1514, + "step": 14363 + }, + { + "epoch": 0.229824, + "grad_norm": 0.65625, + "learning_rate": 7.764032258064516e-05, + "loss": 0.1659, + "step": 14364 + }, + { + "epoch": 0.22984, + "grad_norm": 0.91015625, + "learning_rate": 7.763870967741936e-05, + "loss": 0.1821, + "step": 14365 + }, + { + "epoch": 0.229856, + "grad_norm": 0.5546875, + "learning_rate": 7.763709677419355e-05, + "loss": 0.1526, + "step": 14366 + }, + { + "epoch": 0.229872, + "grad_norm": 1.0546875, + "learning_rate": 7.763548387096775e-05, + "loss": 0.1496, + "step": 14367 + }, + { + "epoch": 0.229888, + "grad_norm": 0.7578125, + "learning_rate": 7.763387096774195e-05, + "loss": 0.1824, + "step": 14368 + }, + { + "epoch": 0.229904, + "grad_norm": 0.88671875, + "learning_rate": 7.763225806451613e-05, + "loss": 0.1826, + "step": 14369 + }, + { + "epoch": 0.22992, + "grad_norm": 1.0078125, + "learning_rate": 7.763064516129033e-05, + "loss": 0.1379, + "step": 14370 + }, + { + "epoch": 0.229936, + "grad_norm": 0.65625, + "learning_rate": 7.762903225806452e-05, + "loss": 0.1636, + "step": 14371 + }, + { + "epoch": 0.229952, + "grad_norm": 0.7734375, + "learning_rate": 7.762741935483872e-05, + "loss": 0.1744, + "step": 14372 + }, + { + "epoch": 0.229968, + "grad_norm": 0.90234375, + "learning_rate": 7.76258064516129e-05, + "loss": 0.2205, + "step": 14373 + }, + { + "epoch": 0.229984, + "grad_norm": 0.62109375, + "learning_rate": 7.76241935483871e-05, + "loss": 0.1489, + "step": 14374 + }, + { + "epoch": 0.23, + "grad_norm": 0.54296875, + "learning_rate": 7.762258064516129e-05, + "loss": 0.1665, + "step": 14375 + }, + { + "epoch": 0.230016, + "grad_norm": 0.671875, + "learning_rate": 7.762096774193549e-05, + "loss": 0.1611, + "step": 14376 + }, + { + "epoch": 0.230032, + "grad_norm": 0.59765625, + "learning_rate": 7.761935483870967e-05, + "loss": 0.1894, + "step": 14377 + }, + { + "epoch": 0.230048, + "grad_norm": 0.73828125, + "learning_rate": 7.761774193548387e-05, + "loss": 0.1515, + "step": 14378 + }, + { + "epoch": 0.230064, + "grad_norm": 1.2578125, + "learning_rate": 7.761612903225806e-05, + "loss": 0.2571, + "step": 14379 + }, + { + "epoch": 0.23008, + "grad_norm": 1.109375, + "learning_rate": 7.761451612903226e-05, + "loss": 0.1659, + "step": 14380 + }, + { + "epoch": 0.230096, + "grad_norm": 0.74609375, + "learning_rate": 7.761290322580646e-05, + "loss": 0.2038, + "step": 14381 + }, + { + "epoch": 0.230112, + "grad_norm": 0.74609375, + "learning_rate": 7.761129032258066e-05, + "loss": 0.1567, + "step": 14382 + }, + { + "epoch": 0.230128, + "grad_norm": 0.6640625, + "learning_rate": 7.760967741935485e-05, + "loss": 0.1964, + "step": 14383 + }, + { + "epoch": 0.230144, + "grad_norm": 1.0390625, + "learning_rate": 7.760806451612903e-05, + "loss": 0.1674, + "step": 14384 + }, + { + "epoch": 0.23016, + "grad_norm": 0.66015625, + "learning_rate": 7.760645161290323e-05, + "loss": 0.1744, + "step": 14385 + }, + { + "epoch": 0.230176, + "grad_norm": 0.765625, + "learning_rate": 7.760483870967742e-05, + "loss": 0.1824, + "step": 14386 + }, + { + "epoch": 0.230192, + "grad_norm": 0.67578125, + "learning_rate": 7.760322580645162e-05, + "loss": 0.1493, + "step": 14387 + }, + { + "epoch": 0.230208, + "grad_norm": 1.2421875, + "learning_rate": 7.76016129032258e-05, + "loss": 0.1837, + "step": 14388 + }, + { + "epoch": 0.230224, + "grad_norm": 0.6328125, + "learning_rate": 7.76e-05, + "loss": 0.1566, + "step": 14389 + }, + { + "epoch": 0.23024, + "grad_norm": 0.64453125, + "learning_rate": 7.759838709677419e-05, + "loss": 0.1464, + "step": 14390 + }, + { + "epoch": 0.230256, + "grad_norm": 1.171875, + "learning_rate": 7.759677419354839e-05, + "loss": 0.1542, + "step": 14391 + }, + { + "epoch": 0.230272, + "grad_norm": 0.96484375, + "learning_rate": 7.759516129032259e-05, + "loss": 0.1953, + "step": 14392 + }, + { + "epoch": 0.230288, + "grad_norm": 0.69921875, + "learning_rate": 7.759354838709679e-05, + "loss": 0.1579, + "step": 14393 + }, + { + "epoch": 0.230304, + "grad_norm": 0.93359375, + "learning_rate": 7.759193548387097e-05, + "loss": 0.1781, + "step": 14394 + }, + { + "epoch": 0.23032, + "grad_norm": 0.73046875, + "learning_rate": 7.759032258064517e-05, + "loss": 0.1894, + "step": 14395 + }, + { + "epoch": 0.230336, + "grad_norm": 1.1015625, + "learning_rate": 7.758870967741936e-05, + "loss": 0.153, + "step": 14396 + }, + { + "epoch": 0.230352, + "grad_norm": 0.921875, + "learning_rate": 7.758709677419356e-05, + "loss": 0.1984, + "step": 14397 + }, + { + "epoch": 0.230368, + "grad_norm": 0.6875, + "learning_rate": 7.758548387096774e-05, + "loss": 0.1479, + "step": 14398 + }, + { + "epoch": 0.230384, + "grad_norm": 0.765625, + "learning_rate": 7.758387096774193e-05, + "loss": 0.1515, + "step": 14399 + }, + { + "epoch": 0.2304, + "grad_norm": 0.85546875, + "learning_rate": 7.758225806451613e-05, + "loss": 0.1526, + "step": 14400 + }, + { + "epoch": 0.230416, + "grad_norm": 1.0703125, + "learning_rate": 7.758064516129032e-05, + "loss": 0.1755, + "step": 14401 + }, + { + "epoch": 0.230432, + "grad_norm": 0.7421875, + "learning_rate": 7.757903225806452e-05, + "loss": 0.178, + "step": 14402 + }, + { + "epoch": 0.230448, + "grad_norm": 1.0546875, + "learning_rate": 7.757741935483872e-05, + "loss": 0.1545, + "step": 14403 + }, + { + "epoch": 0.230464, + "grad_norm": 0.73046875, + "learning_rate": 7.757580645161292e-05, + "loss": 0.1746, + "step": 14404 + }, + { + "epoch": 0.23048, + "grad_norm": 0.67578125, + "learning_rate": 7.75741935483871e-05, + "loss": 0.1725, + "step": 14405 + }, + { + "epoch": 0.230496, + "grad_norm": 0.77734375, + "learning_rate": 7.75725806451613e-05, + "loss": 0.1845, + "step": 14406 + }, + { + "epoch": 0.230512, + "grad_norm": 1.0703125, + "learning_rate": 7.757096774193549e-05, + "loss": 0.2336, + "step": 14407 + }, + { + "epoch": 0.230528, + "grad_norm": 0.5390625, + "learning_rate": 7.756935483870969e-05, + "loss": 0.1655, + "step": 14408 + }, + { + "epoch": 0.230544, + "grad_norm": 0.625, + "learning_rate": 7.756774193548387e-05, + "loss": 0.1671, + "step": 14409 + }, + { + "epoch": 0.23056, + "grad_norm": 0.984375, + "learning_rate": 7.756612903225807e-05, + "loss": 0.1615, + "step": 14410 + }, + { + "epoch": 0.230576, + "grad_norm": 0.87890625, + "learning_rate": 7.756451612903226e-05, + "loss": 0.1848, + "step": 14411 + }, + { + "epoch": 0.230592, + "grad_norm": 1.0234375, + "learning_rate": 7.756290322580646e-05, + "loss": 0.2085, + "step": 14412 + }, + { + "epoch": 0.230608, + "grad_norm": 0.609375, + "learning_rate": 7.756129032258064e-05, + "loss": 0.1451, + "step": 14413 + }, + { + "epoch": 0.230624, + "grad_norm": 1.3359375, + "learning_rate": 7.755967741935483e-05, + "loss": 0.146, + "step": 14414 + }, + { + "epoch": 0.23064, + "grad_norm": 0.56640625, + "learning_rate": 7.755806451612903e-05, + "loss": 0.1391, + "step": 14415 + }, + { + "epoch": 0.230656, + "grad_norm": 1.140625, + "learning_rate": 7.755645161290323e-05, + "loss": 0.1422, + "step": 14416 + }, + { + "epoch": 0.230672, + "grad_norm": 1.0703125, + "learning_rate": 7.755483870967743e-05, + "loss": 0.1551, + "step": 14417 + }, + { + "epoch": 0.230688, + "grad_norm": 1.265625, + "learning_rate": 7.755322580645162e-05, + "loss": 0.2078, + "step": 14418 + }, + { + "epoch": 0.230704, + "grad_norm": 1.0234375, + "learning_rate": 7.755161290322582e-05, + "loss": 0.1725, + "step": 14419 + }, + { + "epoch": 0.23072, + "grad_norm": 0.69921875, + "learning_rate": 7.755e-05, + "loss": 0.1655, + "step": 14420 + }, + { + "epoch": 0.230736, + "grad_norm": 0.94921875, + "learning_rate": 7.75483870967742e-05, + "loss": 0.1667, + "step": 14421 + }, + { + "epoch": 0.230752, + "grad_norm": 0.71484375, + "learning_rate": 7.754677419354839e-05, + "loss": 0.1839, + "step": 14422 + }, + { + "epoch": 0.230768, + "grad_norm": 0.60546875, + "learning_rate": 7.754516129032259e-05, + "loss": 0.1616, + "step": 14423 + }, + { + "epoch": 0.230784, + "grad_norm": 1.265625, + "learning_rate": 7.754354838709677e-05, + "loss": 0.1426, + "step": 14424 + }, + { + "epoch": 0.2308, + "grad_norm": 0.89453125, + "learning_rate": 7.754193548387097e-05, + "loss": 0.2025, + "step": 14425 + }, + { + "epoch": 0.230816, + "grad_norm": 0.82421875, + "learning_rate": 7.754032258064516e-05, + "loss": 0.178, + "step": 14426 + }, + { + "epoch": 0.230832, + "grad_norm": 1.0859375, + "learning_rate": 7.753870967741936e-05, + "loss": 0.2071, + "step": 14427 + }, + { + "epoch": 0.230848, + "grad_norm": 0.74609375, + "learning_rate": 7.753709677419356e-05, + "loss": 0.125, + "step": 14428 + }, + { + "epoch": 0.230864, + "grad_norm": 0.74609375, + "learning_rate": 7.753548387096776e-05, + "loss": 0.1853, + "step": 14429 + }, + { + "epoch": 0.23088, + "grad_norm": 0.640625, + "learning_rate": 7.753387096774194e-05, + "loss": 0.1772, + "step": 14430 + }, + { + "epoch": 0.230896, + "grad_norm": 0.7734375, + "learning_rate": 7.753225806451613e-05, + "loss": 0.1919, + "step": 14431 + }, + { + "epoch": 0.230912, + "grad_norm": 0.671875, + "learning_rate": 7.753064516129033e-05, + "loss": 0.1737, + "step": 14432 + }, + { + "epoch": 0.230928, + "grad_norm": 0.9140625, + "learning_rate": 7.752903225806452e-05, + "loss": 0.1985, + "step": 14433 + }, + { + "epoch": 0.230944, + "grad_norm": 0.74609375, + "learning_rate": 7.752741935483871e-05, + "loss": 0.1593, + "step": 14434 + }, + { + "epoch": 0.23096, + "grad_norm": 1.015625, + "learning_rate": 7.75258064516129e-05, + "loss": 0.2084, + "step": 14435 + }, + { + "epoch": 0.230976, + "grad_norm": 0.87890625, + "learning_rate": 7.75241935483871e-05, + "loss": 0.1757, + "step": 14436 + }, + { + "epoch": 0.230992, + "grad_norm": 0.68359375, + "learning_rate": 7.752258064516129e-05, + "loss": 0.1492, + "step": 14437 + }, + { + "epoch": 0.231008, + "grad_norm": 0.62109375, + "learning_rate": 7.752096774193549e-05, + "loss": 0.1771, + "step": 14438 + }, + { + "epoch": 0.231024, + "grad_norm": 0.76171875, + "learning_rate": 7.751935483870969e-05, + "loss": 0.1808, + "step": 14439 + }, + { + "epoch": 0.23104, + "grad_norm": 0.75390625, + "learning_rate": 7.751774193548387e-05, + "loss": 0.1813, + "step": 14440 + }, + { + "epoch": 0.231056, + "grad_norm": 0.6484375, + "learning_rate": 7.751612903225807e-05, + "loss": 0.1994, + "step": 14441 + }, + { + "epoch": 0.231072, + "grad_norm": 0.84765625, + "learning_rate": 7.751451612903227e-05, + "loss": 0.1428, + "step": 14442 + }, + { + "epoch": 0.231088, + "grad_norm": 0.6015625, + "learning_rate": 7.751290322580646e-05, + "loss": 0.1227, + "step": 14443 + }, + { + "epoch": 0.231104, + "grad_norm": 0.625, + "learning_rate": 7.751129032258066e-05, + "loss": 0.1983, + "step": 14444 + }, + { + "epoch": 0.23112, + "grad_norm": 0.93359375, + "learning_rate": 7.750967741935484e-05, + "loss": 0.1265, + "step": 14445 + }, + { + "epoch": 0.231136, + "grad_norm": 0.828125, + "learning_rate": 7.750806451612903e-05, + "loss": 0.1654, + "step": 14446 + }, + { + "epoch": 0.231152, + "grad_norm": 0.87890625, + "learning_rate": 7.750645161290323e-05, + "loss": 0.1845, + "step": 14447 + }, + { + "epoch": 0.231168, + "grad_norm": 0.98046875, + "learning_rate": 7.750483870967741e-05, + "loss": 0.1949, + "step": 14448 + }, + { + "epoch": 0.231184, + "grad_norm": 0.796875, + "learning_rate": 7.750322580645161e-05, + "loss": 0.1503, + "step": 14449 + }, + { + "epoch": 0.2312, + "grad_norm": 0.66015625, + "learning_rate": 7.75016129032258e-05, + "loss": 0.1518, + "step": 14450 + }, + { + "epoch": 0.231216, + "grad_norm": 1.2265625, + "learning_rate": 7.75e-05, + "loss": 0.2024, + "step": 14451 + }, + { + "epoch": 0.231232, + "grad_norm": 0.79296875, + "learning_rate": 7.74983870967742e-05, + "loss": 0.1548, + "step": 14452 + }, + { + "epoch": 0.231248, + "grad_norm": 1.015625, + "learning_rate": 7.74967741935484e-05, + "loss": 0.1597, + "step": 14453 + }, + { + "epoch": 0.231264, + "grad_norm": 0.87109375, + "learning_rate": 7.749516129032259e-05, + "loss": 0.1715, + "step": 14454 + }, + { + "epoch": 0.23128, + "grad_norm": 0.4921875, + "learning_rate": 7.749354838709678e-05, + "loss": 0.1572, + "step": 14455 + }, + { + "epoch": 0.231296, + "grad_norm": 0.92578125, + "learning_rate": 7.749193548387097e-05, + "loss": 0.1655, + "step": 14456 + }, + { + "epoch": 0.231312, + "grad_norm": 0.64453125, + "learning_rate": 7.749032258064517e-05, + "loss": 0.1692, + "step": 14457 + }, + { + "epoch": 0.231328, + "grad_norm": 1.0234375, + "learning_rate": 7.748870967741936e-05, + "loss": 0.1961, + "step": 14458 + }, + { + "epoch": 0.231344, + "grad_norm": 1.078125, + "learning_rate": 7.748709677419356e-05, + "loss": 0.1531, + "step": 14459 + }, + { + "epoch": 0.23136, + "grad_norm": 0.6796875, + "learning_rate": 7.748548387096774e-05, + "loss": 0.1836, + "step": 14460 + }, + { + "epoch": 0.231376, + "grad_norm": 0.57421875, + "learning_rate": 7.748387096774193e-05, + "loss": 0.1415, + "step": 14461 + }, + { + "epoch": 0.231392, + "grad_norm": 0.57421875, + "learning_rate": 7.748225806451613e-05, + "loss": 0.1721, + "step": 14462 + }, + { + "epoch": 0.231408, + "grad_norm": 0.6796875, + "learning_rate": 7.748064516129033e-05, + "loss": 0.1948, + "step": 14463 + }, + { + "epoch": 0.231424, + "grad_norm": 0.7578125, + "learning_rate": 7.747903225806453e-05, + "loss": 0.1643, + "step": 14464 + }, + { + "epoch": 0.23144, + "grad_norm": 0.8671875, + "learning_rate": 7.747741935483871e-05, + "loss": 0.1712, + "step": 14465 + }, + { + "epoch": 0.231456, + "grad_norm": 0.828125, + "learning_rate": 7.747580645161291e-05, + "loss": 0.2554, + "step": 14466 + }, + { + "epoch": 0.231472, + "grad_norm": 0.80078125, + "learning_rate": 7.74741935483871e-05, + "loss": 0.1651, + "step": 14467 + }, + { + "epoch": 0.231488, + "grad_norm": 1.234375, + "learning_rate": 7.74725806451613e-05, + "loss": 0.1968, + "step": 14468 + }, + { + "epoch": 0.231504, + "grad_norm": 0.6015625, + "learning_rate": 7.747096774193548e-05, + "loss": 0.1687, + "step": 14469 + }, + { + "epoch": 0.23152, + "grad_norm": 1.015625, + "learning_rate": 7.746935483870968e-05, + "loss": 0.1871, + "step": 14470 + }, + { + "epoch": 0.231536, + "grad_norm": 0.734375, + "learning_rate": 7.746774193548387e-05, + "loss": 0.1535, + "step": 14471 + }, + { + "epoch": 0.231552, + "grad_norm": 0.875, + "learning_rate": 7.746612903225807e-05, + "loss": 0.1916, + "step": 14472 + }, + { + "epoch": 0.231568, + "grad_norm": 0.6484375, + "learning_rate": 7.746451612903226e-05, + "loss": 0.1911, + "step": 14473 + }, + { + "epoch": 0.231584, + "grad_norm": 0.8359375, + "learning_rate": 7.746290322580646e-05, + "loss": 0.182, + "step": 14474 + }, + { + "epoch": 0.2316, + "grad_norm": 0.71484375, + "learning_rate": 7.746129032258064e-05, + "loss": 0.1649, + "step": 14475 + }, + { + "epoch": 0.231616, + "grad_norm": 0.91015625, + "learning_rate": 7.745967741935484e-05, + "loss": 0.1774, + "step": 14476 + }, + { + "epoch": 0.231632, + "grad_norm": 0.59765625, + "learning_rate": 7.745806451612904e-05, + "loss": 0.1799, + "step": 14477 + }, + { + "epoch": 0.231648, + "grad_norm": 0.48046875, + "learning_rate": 7.745645161290323e-05, + "loss": 0.1462, + "step": 14478 + }, + { + "epoch": 0.231664, + "grad_norm": 1.21875, + "learning_rate": 7.745483870967743e-05, + "loss": 0.1887, + "step": 14479 + }, + { + "epoch": 0.23168, + "grad_norm": 0.6796875, + "learning_rate": 7.745322580645161e-05, + "loss": 0.1543, + "step": 14480 + }, + { + "epoch": 0.231696, + "grad_norm": 0.79296875, + "learning_rate": 7.745161290322581e-05, + "loss": 0.1991, + "step": 14481 + }, + { + "epoch": 0.231712, + "grad_norm": 0.78515625, + "learning_rate": 7.745e-05, + "loss": 0.1688, + "step": 14482 + }, + { + "epoch": 0.231728, + "grad_norm": 0.9921875, + "learning_rate": 7.74483870967742e-05, + "loss": 0.2674, + "step": 14483 + }, + { + "epoch": 0.231744, + "grad_norm": 0.5078125, + "learning_rate": 7.744677419354838e-05, + "loss": 0.1581, + "step": 14484 + }, + { + "epoch": 0.23176, + "grad_norm": 0.6796875, + "learning_rate": 7.744516129032258e-05, + "loss": 0.2153, + "step": 14485 + }, + { + "epoch": 0.231776, + "grad_norm": 1.6015625, + "learning_rate": 7.744354838709677e-05, + "loss": 0.1777, + "step": 14486 + }, + { + "epoch": 0.231792, + "grad_norm": 0.82421875, + "learning_rate": 7.744193548387097e-05, + "loss": 0.2189, + "step": 14487 + }, + { + "epoch": 0.231808, + "grad_norm": 0.6015625, + "learning_rate": 7.744032258064517e-05, + "loss": 0.1745, + "step": 14488 + }, + { + "epoch": 0.231824, + "grad_norm": 0.80859375, + "learning_rate": 7.743870967741937e-05, + "loss": 0.1517, + "step": 14489 + }, + { + "epoch": 0.23184, + "grad_norm": 0.59765625, + "learning_rate": 7.743709677419356e-05, + "loss": 0.1654, + "step": 14490 + }, + { + "epoch": 0.231856, + "grad_norm": 0.984375, + "learning_rate": 7.743548387096775e-05, + "loss": 0.1645, + "step": 14491 + }, + { + "epoch": 0.231872, + "grad_norm": 0.65234375, + "learning_rate": 7.743387096774194e-05, + "loss": 0.1802, + "step": 14492 + }, + { + "epoch": 0.231888, + "grad_norm": 0.74609375, + "learning_rate": 7.743225806451613e-05, + "loss": 0.1787, + "step": 14493 + }, + { + "epoch": 0.231904, + "grad_norm": 0.89453125, + "learning_rate": 7.743064516129033e-05, + "loss": 0.2303, + "step": 14494 + }, + { + "epoch": 0.23192, + "grad_norm": 0.81640625, + "learning_rate": 7.742903225806451e-05, + "loss": 0.1993, + "step": 14495 + }, + { + "epoch": 0.231936, + "grad_norm": 0.71875, + "learning_rate": 7.742741935483871e-05, + "loss": 0.1883, + "step": 14496 + }, + { + "epoch": 0.231952, + "grad_norm": 0.703125, + "learning_rate": 7.74258064516129e-05, + "loss": 0.19, + "step": 14497 + }, + { + "epoch": 0.231968, + "grad_norm": 0.609375, + "learning_rate": 7.74241935483871e-05, + "loss": 0.1695, + "step": 14498 + }, + { + "epoch": 0.231984, + "grad_norm": 1.125, + "learning_rate": 7.74225806451613e-05, + "loss": 0.1933, + "step": 14499 + }, + { + "epoch": 0.232, + "grad_norm": 0.75, + "learning_rate": 7.74209677419355e-05, + "loss": 0.1703, + "step": 14500 + }, + { + "epoch": 0.232016, + "grad_norm": 0.59765625, + "learning_rate": 7.741935483870968e-05, + "loss": 0.1623, + "step": 14501 + }, + { + "epoch": 0.232032, + "grad_norm": 0.83984375, + "learning_rate": 7.741774193548388e-05, + "loss": 0.2099, + "step": 14502 + }, + { + "epoch": 0.232048, + "grad_norm": 0.6796875, + "learning_rate": 7.741612903225807e-05, + "loss": 0.1847, + "step": 14503 + }, + { + "epoch": 0.232064, + "grad_norm": 0.89453125, + "learning_rate": 7.741451612903227e-05, + "loss": 0.157, + "step": 14504 + }, + { + "epoch": 0.23208, + "grad_norm": 0.84375, + "learning_rate": 7.741290322580645e-05, + "loss": 0.1759, + "step": 14505 + }, + { + "epoch": 0.232096, + "grad_norm": 0.5859375, + "learning_rate": 7.741129032258065e-05, + "loss": 0.1788, + "step": 14506 + }, + { + "epoch": 0.232112, + "grad_norm": 0.74609375, + "learning_rate": 7.740967741935484e-05, + "loss": 0.2113, + "step": 14507 + }, + { + "epoch": 0.232128, + "grad_norm": 0.8359375, + "learning_rate": 7.740806451612903e-05, + "loss": 0.1739, + "step": 14508 + }, + { + "epoch": 0.232144, + "grad_norm": 0.70703125, + "learning_rate": 7.740645161290323e-05, + "loss": 0.2099, + "step": 14509 + }, + { + "epoch": 0.23216, + "grad_norm": 0.8046875, + "learning_rate": 7.740483870967741e-05, + "loss": 0.1731, + "step": 14510 + }, + { + "epoch": 0.232176, + "grad_norm": 0.75390625, + "learning_rate": 7.740322580645161e-05, + "loss": 0.172, + "step": 14511 + }, + { + "epoch": 0.232192, + "grad_norm": 0.921875, + "learning_rate": 7.740161290322581e-05, + "loss": 0.1707, + "step": 14512 + }, + { + "epoch": 0.232208, + "grad_norm": 0.9921875, + "learning_rate": 7.740000000000001e-05, + "loss": 0.147, + "step": 14513 + }, + { + "epoch": 0.232224, + "grad_norm": 0.734375, + "learning_rate": 7.73983870967742e-05, + "loss": 0.1861, + "step": 14514 + }, + { + "epoch": 0.23224, + "grad_norm": 0.66796875, + "learning_rate": 7.73967741935484e-05, + "loss": 0.154, + "step": 14515 + }, + { + "epoch": 0.232256, + "grad_norm": 0.81640625, + "learning_rate": 7.739516129032258e-05, + "loss": 0.1795, + "step": 14516 + }, + { + "epoch": 0.232272, + "grad_norm": 0.63671875, + "learning_rate": 7.739354838709678e-05, + "loss": 0.1219, + "step": 14517 + }, + { + "epoch": 0.232288, + "grad_norm": 0.73046875, + "learning_rate": 7.739193548387097e-05, + "loss": 0.1978, + "step": 14518 + }, + { + "epoch": 0.232304, + "grad_norm": 0.7578125, + "learning_rate": 7.739032258064517e-05, + "loss": 0.157, + "step": 14519 + }, + { + "epoch": 0.23232, + "grad_norm": 0.46875, + "learning_rate": 7.738870967741935e-05, + "loss": 0.157, + "step": 14520 + }, + { + "epoch": 0.232336, + "grad_norm": 1.1640625, + "learning_rate": 7.738709677419355e-05, + "loss": 0.1395, + "step": 14521 + }, + { + "epoch": 0.232352, + "grad_norm": 0.68359375, + "learning_rate": 7.738548387096774e-05, + "loss": 0.1759, + "step": 14522 + }, + { + "epoch": 0.232368, + "grad_norm": 0.66015625, + "learning_rate": 7.738387096774194e-05, + "loss": 0.1716, + "step": 14523 + }, + { + "epoch": 0.232384, + "grad_norm": 1.0859375, + "learning_rate": 7.738225806451614e-05, + "loss": 0.1425, + "step": 14524 + }, + { + "epoch": 0.2324, + "grad_norm": 0.9609375, + "learning_rate": 7.738064516129033e-05, + "loss": 0.187, + "step": 14525 + }, + { + "epoch": 0.232416, + "grad_norm": 0.48828125, + "learning_rate": 7.737903225806452e-05, + "loss": 0.1543, + "step": 14526 + }, + { + "epoch": 0.232432, + "grad_norm": 0.73828125, + "learning_rate": 7.737741935483871e-05, + "loss": 0.1479, + "step": 14527 + }, + { + "epoch": 0.232448, + "grad_norm": 0.53125, + "learning_rate": 7.737580645161291e-05, + "loss": 0.1621, + "step": 14528 + }, + { + "epoch": 0.232464, + "grad_norm": 0.59765625, + "learning_rate": 7.73741935483871e-05, + "loss": 0.1518, + "step": 14529 + }, + { + "epoch": 0.23248, + "grad_norm": 0.62890625, + "learning_rate": 7.73725806451613e-05, + "loss": 0.1883, + "step": 14530 + }, + { + "epoch": 0.232496, + "grad_norm": 1.0078125, + "learning_rate": 7.737096774193548e-05, + "loss": 0.1953, + "step": 14531 + }, + { + "epoch": 0.232512, + "grad_norm": 0.81640625, + "learning_rate": 7.736935483870968e-05, + "loss": 0.1689, + "step": 14532 + }, + { + "epoch": 0.232528, + "grad_norm": 0.65625, + "learning_rate": 7.736774193548387e-05, + "loss": 0.1612, + "step": 14533 + }, + { + "epoch": 0.232544, + "grad_norm": 1.015625, + "learning_rate": 7.736612903225807e-05, + "loss": 0.1472, + "step": 14534 + }, + { + "epoch": 0.23256, + "grad_norm": 0.70703125, + "learning_rate": 7.736451612903225e-05, + "loss": 0.1515, + "step": 14535 + }, + { + "epoch": 0.232576, + "grad_norm": 0.921875, + "learning_rate": 7.736290322580645e-05, + "loss": 0.1305, + "step": 14536 + }, + { + "epoch": 0.232592, + "grad_norm": 0.890625, + "learning_rate": 7.736129032258065e-05, + "loss": 0.1584, + "step": 14537 + }, + { + "epoch": 0.232608, + "grad_norm": 0.8359375, + "learning_rate": 7.735967741935485e-05, + "loss": 0.15, + "step": 14538 + }, + { + "epoch": 0.232624, + "grad_norm": 0.9921875, + "learning_rate": 7.735806451612904e-05, + "loss": 0.1547, + "step": 14539 + }, + { + "epoch": 0.23264, + "grad_norm": 0.78125, + "learning_rate": 7.735645161290322e-05, + "loss": 0.1404, + "step": 14540 + }, + { + "epoch": 0.232656, + "grad_norm": 0.6953125, + "learning_rate": 7.735483870967742e-05, + "loss": 0.1914, + "step": 14541 + }, + { + "epoch": 0.232672, + "grad_norm": 0.62890625, + "learning_rate": 7.735322580645161e-05, + "loss": 0.1555, + "step": 14542 + }, + { + "epoch": 0.232688, + "grad_norm": 0.6484375, + "learning_rate": 7.735161290322581e-05, + "loss": 0.1751, + "step": 14543 + }, + { + "epoch": 0.232704, + "grad_norm": 0.96875, + "learning_rate": 7.735e-05, + "loss": 0.1863, + "step": 14544 + }, + { + "epoch": 0.23272, + "grad_norm": 0.8125, + "learning_rate": 7.73483870967742e-05, + "loss": 0.1908, + "step": 14545 + }, + { + "epoch": 0.232736, + "grad_norm": 0.65625, + "learning_rate": 7.734677419354838e-05, + "loss": 0.178, + "step": 14546 + }, + { + "epoch": 0.232752, + "grad_norm": 0.66796875, + "learning_rate": 7.734516129032258e-05, + "loss": 0.1412, + "step": 14547 + }, + { + "epoch": 0.232768, + "grad_norm": 0.73828125, + "learning_rate": 7.734354838709678e-05, + "loss": 0.1686, + "step": 14548 + }, + { + "epoch": 0.232784, + "grad_norm": 0.7890625, + "learning_rate": 7.734193548387098e-05, + "loss": 0.1458, + "step": 14549 + }, + { + "epoch": 0.2328, + "grad_norm": 0.72265625, + "learning_rate": 7.734032258064517e-05, + "loss": 0.1467, + "step": 14550 + }, + { + "epoch": 0.232816, + "grad_norm": 0.94140625, + "learning_rate": 7.733870967741937e-05, + "loss": 0.2078, + "step": 14551 + }, + { + "epoch": 0.232832, + "grad_norm": 0.6484375, + "learning_rate": 7.733709677419355e-05, + "loss": 0.1771, + "step": 14552 + }, + { + "epoch": 0.232848, + "grad_norm": 0.77734375, + "learning_rate": 7.733548387096775e-05, + "loss": 0.188, + "step": 14553 + }, + { + "epoch": 0.232864, + "grad_norm": 0.8984375, + "learning_rate": 7.733387096774194e-05, + "loss": 0.1963, + "step": 14554 + }, + { + "epoch": 0.23288, + "grad_norm": 0.890625, + "learning_rate": 7.733225806451612e-05, + "loss": 0.1905, + "step": 14555 + }, + { + "epoch": 0.232896, + "grad_norm": 0.53515625, + "learning_rate": 7.733064516129032e-05, + "loss": 0.1442, + "step": 14556 + }, + { + "epoch": 0.232912, + "grad_norm": 0.921875, + "learning_rate": 7.732903225806451e-05, + "loss": 0.1807, + "step": 14557 + }, + { + "epoch": 0.232928, + "grad_norm": 0.6328125, + "learning_rate": 7.732741935483871e-05, + "loss": 0.1952, + "step": 14558 + }, + { + "epoch": 0.232944, + "grad_norm": 0.7109375, + "learning_rate": 7.732580645161291e-05, + "loss": 0.1831, + "step": 14559 + }, + { + "epoch": 0.23296, + "grad_norm": 0.6875, + "learning_rate": 7.732419354838711e-05, + "loss": 0.1819, + "step": 14560 + }, + { + "epoch": 0.232976, + "grad_norm": 0.85546875, + "learning_rate": 7.73225806451613e-05, + "loss": 0.1848, + "step": 14561 + }, + { + "epoch": 0.232992, + "grad_norm": 0.625, + "learning_rate": 7.73209677419355e-05, + "loss": 0.1649, + "step": 14562 + }, + { + "epoch": 0.233008, + "grad_norm": 0.9921875, + "learning_rate": 7.731935483870968e-05, + "loss": 0.2448, + "step": 14563 + }, + { + "epoch": 0.233024, + "grad_norm": 0.7890625, + "learning_rate": 7.731774193548388e-05, + "loss": 0.197, + "step": 14564 + }, + { + "epoch": 0.23304, + "grad_norm": 0.6484375, + "learning_rate": 7.731612903225807e-05, + "loss": 0.175, + "step": 14565 + }, + { + "epoch": 0.233056, + "grad_norm": 0.86328125, + "learning_rate": 7.731451612903227e-05, + "loss": 0.2002, + "step": 14566 + }, + { + "epoch": 0.233072, + "grad_norm": 1.140625, + "learning_rate": 7.731290322580645e-05, + "loss": 0.175, + "step": 14567 + }, + { + "epoch": 0.233088, + "grad_norm": 1.1953125, + "learning_rate": 7.731129032258065e-05, + "loss": 0.1962, + "step": 14568 + }, + { + "epoch": 0.233104, + "grad_norm": 0.80859375, + "learning_rate": 7.730967741935484e-05, + "loss": 0.2008, + "step": 14569 + }, + { + "epoch": 0.23312, + "grad_norm": 0.859375, + "learning_rate": 7.730806451612902e-05, + "loss": 0.1255, + "step": 14570 + }, + { + "epoch": 0.233136, + "grad_norm": 0.79296875, + "learning_rate": 7.730645161290322e-05, + "loss": 0.1599, + "step": 14571 + }, + { + "epoch": 0.233152, + "grad_norm": 0.96484375, + "learning_rate": 7.730483870967742e-05, + "loss": 0.2022, + "step": 14572 + }, + { + "epoch": 0.233168, + "grad_norm": 0.58984375, + "learning_rate": 7.730322580645162e-05, + "loss": 0.1465, + "step": 14573 + }, + { + "epoch": 0.233184, + "grad_norm": 1.046875, + "learning_rate": 7.730161290322581e-05, + "loss": 0.1668, + "step": 14574 + }, + { + "epoch": 0.2332, + "grad_norm": 0.5703125, + "learning_rate": 7.730000000000001e-05, + "loss": 0.1568, + "step": 14575 + }, + { + "epoch": 0.233216, + "grad_norm": 0.9765625, + "learning_rate": 7.72983870967742e-05, + "loss": 0.1804, + "step": 14576 + }, + { + "epoch": 0.233232, + "grad_norm": 0.78515625, + "learning_rate": 7.72967741935484e-05, + "loss": 0.1706, + "step": 14577 + }, + { + "epoch": 0.233248, + "grad_norm": 1.0, + "learning_rate": 7.729516129032258e-05, + "loss": 0.1963, + "step": 14578 + }, + { + "epoch": 0.233264, + "grad_norm": 0.921875, + "learning_rate": 7.729354838709678e-05, + "loss": 0.2005, + "step": 14579 + }, + { + "epoch": 0.23328, + "grad_norm": 0.7734375, + "learning_rate": 7.729193548387097e-05, + "loss": 0.1567, + "step": 14580 + }, + { + "epoch": 0.233296, + "grad_norm": 0.97265625, + "learning_rate": 7.729032258064517e-05, + "loss": 0.2156, + "step": 14581 + }, + { + "epoch": 0.233312, + "grad_norm": 0.88671875, + "learning_rate": 7.728870967741935e-05, + "loss": 0.1833, + "step": 14582 + }, + { + "epoch": 0.233328, + "grad_norm": 0.8203125, + "learning_rate": 7.728709677419355e-05, + "loss": 0.1325, + "step": 14583 + }, + { + "epoch": 0.233344, + "grad_norm": 0.77734375, + "learning_rate": 7.728548387096775e-05, + "loss": 0.1894, + "step": 14584 + }, + { + "epoch": 0.23336, + "grad_norm": 0.9921875, + "learning_rate": 7.728387096774194e-05, + "loss": 0.1633, + "step": 14585 + }, + { + "epoch": 0.233376, + "grad_norm": 1.6171875, + "learning_rate": 7.728225806451614e-05, + "loss": 0.1768, + "step": 14586 + }, + { + "epoch": 0.233392, + "grad_norm": 1.0078125, + "learning_rate": 7.728064516129032e-05, + "loss": 0.1712, + "step": 14587 + }, + { + "epoch": 0.233408, + "grad_norm": 1.0859375, + "learning_rate": 7.727903225806452e-05, + "loss": 0.1447, + "step": 14588 + }, + { + "epoch": 0.233424, + "grad_norm": 0.82421875, + "learning_rate": 7.727741935483871e-05, + "loss": 0.1785, + "step": 14589 + }, + { + "epoch": 0.23344, + "grad_norm": 0.93359375, + "learning_rate": 7.727580645161291e-05, + "loss": 0.1463, + "step": 14590 + }, + { + "epoch": 0.233456, + "grad_norm": 0.52734375, + "learning_rate": 7.72741935483871e-05, + "loss": 0.1514, + "step": 14591 + }, + { + "epoch": 0.233472, + "grad_norm": 0.7109375, + "learning_rate": 7.72725806451613e-05, + "loss": 0.1431, + "step": 14592 + }, + { + "epoch": 0.233488, + "grad_norm": 0.80078125, + "learning_rate": 7.727096774193548e-05, + "loss": 0.1827, + "step": 14593 + }, + { + "epoch": 0.233504, + "grad_norm": 0.69140625, + "learning_rate": 7.726935483870968e-05, + "loss": 0.1521, + "step": 14594 + }, + { + "epoch": 0.23352, + "grad_norm": 0.80859375, + "learning_rate": 7.726774193548388e-05, + "loss": 0.1825, + "step": 14595 + }, + { + "epoch": 0.233536, + "grad_norm": 1.2421875, + "learning_rate": 7.726612903225808e-05, + "loss": 0.1217, + "step": 14596 + }, + { + "epoch": 0.233552, + "grad_norm": 0.65625, + "learning_rate": 7.726451612903227e-05, + "loss": 0.1756, + "step": 14597 + }, + { + "epoch": 0.233568, + "grad_norm": 0.87890625, + "learning_rate": 7.726290322580646e-05, + "loss": 0.149, + "step": 14598 + }, + { + "epoch": 0.233584, + "grad_norm": 0.61328125, + "learning_rate": 7.726129032258065e-05, + "loss": 0.1743, + "step": 14599 + }, + { + "epoch": 0.2336, + "grad_norm": 0.9296875, + "learning_rate": 7.725967741935485e-05, + "loss": 0.1392, + "step": 14600 + }, + { + "epoch": 0.233616, + "grad_norm": 0.62890625, + "learning_rate": 7.725806451612904e-05, + "loss": 0.1416, + "step": 14601 + }, + { + "epoch": 0.233632, + "grad_norm": 1.1484375, + "learning_rate": 7.725645161290322e-05, + "loss": 0.2279, + "step": 14602 + }, + { + "epoch": 0.233648, + "grad_norm": 0.51953125, + "learning_rate": 7.725483870967742e-05, + "loss": 0.1385, + "step": 14603 + }, + { + "epoch": 0.233664, + "grad_norm": 0.5390625, + "learning_rate": 7.725322580645161e-05, + "loss": 0.1738, + "step": 14604 + }, + { + "epoch": 0.23368, + "grad_norm": 0.70703125, + "learning_rate": 7.725161290322581e-05, + "loss": 0.1748, + "step": 14605 + }, + { + "epoch": 0.233696, + "grad_norm": 0.890625, + "learning_rate": 7.725e-05, + "loss": 0.1617, + "step": 14606 + }, + { + "epoch": 0.233712, + "grad_norm": 0.84375, + "learning_rate": 7.72483870967742e-05, + "loss": 0.2016, + "step": 14607 + }, + { + "epoch": 0.233728, + "grad_norm": 0.4921875, + "learning_rate": 7.724677419354839e-05, + "loss": 0.1475, + "step": 14608 + }, + { + "epoch": 0.233744, + "grad_norm": 0.57421875, + "learning_rate": 7.724516129032259e-05, + "loss": 0.1603, + "step": 14609 + }, + { + "epoch": 0.23376, + "grad_norm": 0.46484375, + "learning_rate": 7.724354838709678e-05, + "loss": 0.1538, + "step": 14610 + }, + { + "epoch": 0.233776, + "grad_norm": 0.5703125, + "learning_rate": 7.724193548387098e-05, + "loss": 0.1736, + "step": 14611 + }, + { + "epoch": 0.233792, + "grad_norm": 1.0390625, + "learning_rate": 7.724032258064516e-05, + "loss": 0.1441, + "step": 14612 + }, + { + "epoch": 0.233808, + "grad_norm": 0.9765625, + "learning_rate": 7.723870967741936e-05, + "loss": 0.152, + "step": 14613 + }, + { + "epoch": 0.233824, + "grad_norm": 0.71875, + "learning_rate": 7.723709677419355e-05, + "loss": 0.1808, + "step": 14614 + }, + { + "epoch": 0.23384, + "grad_norm": 0.50390625, + "learning_rate": 7.723548387096775e-05, + "loss": 0.1889, + "step": 14615 + }, + { + "epoch": 0.233856, + "grad_norm": 1.2421875, + "learning_rate": 7.723387096774194e-05, + "loss": 0.1947, + "step": 14616 + }, + { + "epoch": 0.233872, + "grad_norm": 0.87890625, + "learning_rate": 7.723225806451612e-05, + "loss": 0.1778, + "step": 14617 + }, + { + "epoch": 0.233888, + "grad_norm": 1.046875, + "learning_rate": 7.723064516129032e-05, + "loss": 0.1811, + "step": 14618 + }, + { + "epoch": 0.233904, + "grad_norm": 1.1328125, + "learning_rate": 7.722903225806452e-05, + "loss": 0.1591, + "step": 14619 + }, + { + "epoch": 0.23392, + "grad_norm": 0.734375, + "learning_rate": 7.722741935483872e-05, + "loss": 0.1625, + "step": 14620 + }, + { + "epoch": 0.233936, + "grad_norm": 2.3125, + "learning_rate": 7.722580645161291e-05, + "loss": 0.2065, + "step": 14621 + }, + { + "epoch": 0.233952, + "grad_norm": 0.6171875, + "learning_rate": 7.72241935483871e-05, + "loss": 0.1447, + "step": 14622 + }, + { + "epoch": 0.233968, + "grad_norm": 0.62890625, + "learning_rate": 7.722258064516129e-05, + "loss": 0.1697, + "step": 14623 + }, + { + "epoch": 0.233984, + "grad_norm": 0.93359375, + "learning_rate": 7.722096774193549e-05, + "loss": 0.1903, + "step": 14624 + }, + { + "epoch": 0.234, + "grad_norm": 0.69140625, + "learning_rate": 7.721935483870968e-05, + "loss": 0.2035, + "step": 14625 + }, + { + "epoch": 0.234016, + "grad_norm": 0.71484375, + "learning_rate": 7.721774193548388e-05, + "loss": 0.1608, + "step": 14626 + }, + { + "epoch": 0.234032, + "grad_norm": 1.0546875, + "learning_rate": 7.721612903225806e-05, + "loss": 0.172, + "step": 14627 + }, + { + "epoch": 0.234048, + "grad_norm": 0.83984375, + "learning_rate": 7.721451612903226e-05, + "loss": 0.1828, + "step": 14628 + }, + { + "epoch": 0.234064, + "grad_norm": 0.62109375, + "learning_rate": 7.721290322580645e-05, + "loss": 0.1308, + "step": 14629 + }, + { + "epoch": 0.23408, + "grad_norm": 0.953125, + "learning_rate": 7.721129032258065e-05, + "loss": 0.1568, + "step": 14630 + }, + { + "epoch": 0.234096, + "grad_norm": 0.85546875, + "learning_rate": 7.720967741935484e-05, + "loss": 0.1594, + "step": 14631 + }, + { + "epoch": 0.234112, + "grad_norm": 0.9296875, + "learning_rate": 7.720806451612904e-05, + "loss": 0.1614, + "step": 14632 + }, + { + "epoch": 0.234128, + "grad_norm": 0.703125, + "learning_rate": 7.720645161290323e-05, + "loss": 0.1897, + "step": 14633 + }, + { + "epoch": 0.234144, + "grad_norm": 1.296875, + "learning_rate": 7.720483870967742e-05, + "loss": 0.2202, + "step": 14634 + }, + { + "epoch": 0.23416, + "grad_norm": 0.69140625, + "learning_rate": 7.720322580645162e-05, + "loss": 0.1465, + "step": 14635 + }, + { + "epoch": 0.234176, + "grad_norm": 1.1484375, + "learning_rate": 7.72016129032258e-05, + "loss": 0.1508, + "step": 14636 + }, + { + "epoch": 0.234192, + "grad_norm": 0.70703125, + "learning_rate": 7.72e-05, + "loss": 0.1477, + "step": 14637 + }, + { + "epoch": 0.234208, + "grad_norm": 0.734375, + "learning_rate": 7.719838709677419e-05, + "loss": 0.1708, + "step": 14638 + }, + { + "epoch": 0.234224, + "grad_norm": 0.91796875, + "learning_rate": 7.719677419354839e-05, + "loss": 0.1479, + "step": 14639 + }, + { + "epoch": 0.23424, + "grad_norm": 0.640625, + "learning_rate": 7.719516129032258e-05, + "loss": 0.1528, + "step": 14640 + }, + { + "epoch": 0.234256, + "grad_norm": 0.95703125, + "learning_rate": 7.719354838709678e-05, + "loss": 0.1645, + "step": 14641 + }, + { + "epoch": 0.234272, + "grad_norm": 0.68359375, + "learning_rate": 7.719193548387096e-05, + "loss": 0.1589, + "step": 14642 + }, + { + "epoch": 0.234288, + "grad_norm": 0.6640625, + "learning_rate": 7.719032258064516e-05, + "loss": 0.1809, + "step": 14643 + }, + { + "epoch": 0.234304, + "grad_norm": 0.9609375, + "learning_rate": 7.718870967741936e-05, + "loss": 0.211, + "step": 14644 + }, + { + "epoch": 0.23432, + "grad_norm": 0.91015625, + "learning_rate": 7.718709677419356e-05, + "loss": 0.1461, + "step": 14645 + }, + { + "epoch": 0.234336, + "grad_norm": 0.890625, + "learning_rate": 7.718548387096775e-05, + "loss": 0.1793, + "step": 14646 + }, + { + "epoch": 0.234352, + "grad_norm": 1.5390625, + "learning_rate": 7.718387096774195e-05, + "loss": 0.1654, + "step": 14647 + }, + { + "epoch": 0.234368, + "grad_norm": 1.0859375, + "learning_rate": 7.718225806451613e-05, + "loss": 0.169, + "step": 14648 + }, + { + "epoch": 0.234384, + "grad_norm": 0.921875, + "learning_rate": 7.718064516129032e-05, + "loss": 0.2041, + "step": 14649 + }, + { + "epoch": 0.2344, + "grad_norm": 1.21875, + "learning_rate": 7.717903225806452e-05, + "loss": 0.1824, + "step": 14650 + }, + { + "epoch": 0.234416, + "grad_norm": 0.7421875, + "learning_rate": 7.71774193548387e-05, + "loss": 0.162, + "step": 14651 + }, + { + "epoch": 0.234432, + "grad_norm": 0.8359375, + "learning_rate": 7.71758064516129e-05, + "loss": 0.1876, + "step": 14652 + }, + { + "epoch": 0.234448, + "grad_norm": 0.92578125, + "learning_rate": 7.717419354838709e-05, + "loss": 0.1779, + "step": 14653 + }, + { + "epoch": 0.234464, + "grad_norm": 0.640625, + "learning_rate": 7.717258064516129e-05, + "loss": 0.1417, + "step": 14654 + }, + { + "epoch": 0.23448, + "grad_norm": 0.6953125, + "learning_rate": 7.717096774193549e-05, + "loss": 0.1771, + "step": 14655 + }, + { + "epoch": 0.234496, + "grad_norm": 0.58203125, + "learning_rate": 7.716935483870969e-05, + "loss": 0.2104, + "step": 14656 + }, + { + "epoch": 0.234512, + "grad_norm": 0.6875, + "learning_rate": 7.716774193548388e-05, + "loss": 0.2187, + "step": 14657 + }, + { + "epoch": 0.234528, + "grad_norm": 0.83203125, + "learning_rate": 7.716612903225808e-05, + "loss": 0.1902, + "step": 14658 + }, + { + "epoch": 0.234544, + "grad_norm": 0.64453125, + "learning_rate": 7.716451612903226e-05, + "loss": 0.1766, + "step": 14659 + }, + { + "epoch": 0.23456, + "grad_norm": 0.82421875, + "learning_rate": 7.716290322580646e-05, + "loss": 0.1991, + "step": 14660 + }, + { + "epoch": 0.234576, + "grad_norm": 0.7421875, + "learning_rate": 7.716129032258065e-05, + "loss": 0.1907, + "step": 14661 + }, + { + "epoch": 0.234592, + "grad_norm": 0.60546875, + "learning_rate": 7.715967741935485e-05, + "loss": 0.1773, + "step": 14662 + }, + { + "epoch": 0.234608, + "grad_norm": 0.953125, + "learning_rate": 7.715806451612903e-05, + "loss": 0.1509, + "step": 14663 + }, + { + "epoch": 0.234624, + "grad_norm": 0.8125, + "learning_rate": 7.715645161290322e-05, + "loss": 0.1627, + "step": 14664 + }, + { + "epoch": 0.23464, + "grad_norm": 0.734375, + "learning_rate": 7.715483870967742e-05, + "loss": 0.1772, + "step": 14665 + }, + { + "epoch": 0.234656, + "grad_norm": 0.8515625, + "learning_rate": 7.71532258064516e-05, + "loss": 0.1628, + "step": 14666 + }, + { + "epoch": 0.234672, + "grad_norm": 0.46875, + "learning_rate": 7.71516129032258e-05, + "loss": 0.1669, + "step": 14667 + }, + { + "epoch": 0.234688, + "grad_norm": 0.7265625, + "learning_rate": 7.715e-05, + "loss": 0.1793, + "step": 14668 + }, + { + "epoch": 0.234704, + "grad_norm": 0.63671875, + "learning_rate": 7.71483870967742e-05, + "loss": 0.17, + "step": 14669 + }, + { + "epoch": 0.23472, + "grad_norm": 0.80859375, + "learning_rate": 7.714677419354839e-05, + "loss": 0.1344, + "step": 14670 + }, + { + "epoch": 0.234736, + "grad_norm": 0.90234375, + "learning_rate": 7.714516129032259e-05, + "loss": 0.1913, + "step": 14671 + }, + { + "epoch": 0.234752, + "grad_norm": 0.56640625, + "learning_rate": 7.714354838709678e-05, + "loss": 0.1943, + "step": 14672 + }, + { + "epoch": 0.234768, + "grad_norm": 0.55078125, + "learning_rate": 7.714193548387098e-05, + "loss": 0.1736, + "step": 14673 + }, + { + "epoch": 0.234784, + "grad_norm": 1.0546875, + "learning_rate": 7.714032258064516e-05, + "loss": 0.1891, + "step": 14674 + }, + { + "epoch": 0.2348, + "grad_norm": 0.62890625, + "learning_rate": 7.713870967741936e-05, + "loss": 0.1728, + "step": 14675 + }, + { + "epoch": 0.234816, + "grad_norm": 0.7265625, + "learning_rate": 7.713709677419355e-05, + "loss": 0.1909, + "step": 14676 + }, + { + "epoch": 0.234832, + "grad_norm": 0.65234375, + "learning_rate": 7.713548387096775e-05, + "loss": 0.1348, + "step": 14677 + }, + { + "epoch": 0.234848, + "grad_norm": 1.0390625, + "learning_rate": 7.713387096774193e-05, + "loss": 0.1781, + "step": 14678 + }, + { + "epoch": 0.234864, + "grad_norm": 0.83984375, + "learning_rate": 7.713225806451613e-05, + "loss": 0.147, + "step": 14679 + }, + { + "epoch": 0.23488, + "grad_norm": 0.80859375, + "learning_rate": 7.713064516129033e-05, + "loss": 0.1912, + "step": 14680 + }, + { + "epoch": 0.234896, + "grad_norm": 0.86328125, + "learning_rate": 7.712903225806452e-05, + "loss": 0.1828, + "step": 14681 + }, + { + "epoch": 0.234912, + "grad_norm": 0.7890625, + "learning_rate": 7.712741935483872e-05, + "loss": 0.1836, + "step": 14682 + }, + { + "epoch": 0.234928, + "grad_norm": 0.9921875, + "learning_rate": 7.71258064516129e-05, + "loss": 0.1907, + "step": 14683 + }, + { + "epoch": 0.234944, + "grad_norm": 0.73828125, + "learning_rate": 7.71241935483871e-05, + "loss": 0.1763, + "step": 14684 + }, + { + "epoch": 0.23496, + "grad_norm": 0.90625, + "learning_rate": 7.712258064516129e-05, + "loss": 0.2226, + "step": 14685 + }, + { + "epoch": 0.234976, + "grad_norm": 1.046875, + "learning_rate": 7.712096774193549e-05, + "loss": 0.1258, + "step": 14686 + }, + { + "epoch": 0.234992, + "grad_norm": 0.7421875, + "learning_rate": 7.711935483870968e-05, + "loss": 0.1874, + "step": 14687 + }, + { + "epoch": 0.235008, + "grad_norm": 0.79296875, + "learning_rate": 7.711774193548388e-05, + "loss": 0.2055, + "step": 14688 + }, + { + "epoch": 0.235024, + "grad_norm": 0.93359375, + "learning_rate": 7.711612903225806e-05, + "loss": 0.1595, + "step": 14689 + }, + { + "epoch": 0.23504, + "grad_norm": 0.71484375, + "learning_rate": 7.711451612903226e-05, + "loss": 0.1446, + "step": 14690 + }, + { + "epoch": 0.235056, + "grad_norm": 1.0, + "learning_rate": 7.711290322580646e-05, + "loss": 0.1627, + "step": 14691 + }, + { + "epoch": 0.235072, + "grad_norm": 1.0703125, + "learning_rate": 7.711129032258065e-05, + "loss": 0.1803, + "step": 14692 + }, + { + "epoch": 0.235088, + "grad_norm": 0.80859375, + "learning_rate": 7.710967741935485e-05, + "loss": 0.1417, + "step": 14693 + }, + { + "epoch": 0.235104, + "grad_norm": 0.890625, + "learning_rate": 7.710806451612903e-05, + "loss": 0.1533, + "step": 14694 + }, + { + "epoch": 0.23512, + "grad_norm": 0.71484375, + "learning_rate": 7.710645161290323e-05, + "loss": 0.1508, + "step": 14695 + }, + { + "epoch": 0.235136, + "grad_norm": 0.484375, + "learning_rate": 7.710483870967742e-05, + "loss": 0.1818, + "step": 14696 + }, + { + "epoch": 0.235152, + "grad_norm": 0.79296875, + "learning_rate": 7.710322580645162e-05, + "loss": 0.1595, + "step": 14697 + }, + { + "epoch": 0.235168, + "grad_norm": 0.90234375, + "learning_rate": 7.71016129032258e-05, + "loss": 0.1546, + "step": 14698 + }, + { + "epoch": 0.235184, + "grad_norm": 1.0625, + "learning_rate": 7.71e-05, + "loss": 0.1695, + "step": 14699 + }, + { + "epoch": 0.2352, + "grad_norm": 0.84765625, + "learning_rate": 7.709838709677419e-05, + "loss": 0.2073, + "step": 14700 + }, + { + "epoch": 0.235216, + "grad_norm": 0.8359375, + "learning_rate": 7.709677419354839e-05, + "loss": 0.1876, + "step": 14701 + }, + { + "epoch": 0.235232, + "grad_norm": 0.6953125, + "learning_rate": 7.709516129032258e-05, + "loss": 0.164, + "step": 14702 + }, + { + "epoch": 0.235248, + "grad_norm": 0.9765625, + "learning_rate": 7.709354838709678e-05, + "loss": 0.1719, + "step": 14703 + }, + { + "epoch": 0.235264, + "grad_norm": 0.7890625, + "learning_rate": 7.709193548387097e-05, + "loss": 0.1635, + "step": 14704 + }, + { + "epoch": 0.23528, + "grad_norm": 1.1875, + "learning_rate": 7.709032258064517e-05, + "loss": 0.1699, + "step": 14705 + }, + { + "epoch": 0.235296, + "grad_norm": 0.66015625, + "learning_rate": 7.708870967741936e-05, + "loss": 0.1756, + "step": 14706 + }, + { + "epoch": 0.235312, + "grad_norm": 0.53125, + "learning_rate": 7.708709677419356e-05, + "loss": 0.1559, + "step": 14707 + }, + { + "epoch": 0.235328, + "grad_norm": 1.0078125, + "learning_rate": 7.708548387096775e-05, + "loss": 0.2052, + "step": 14708 + }, + { + "epoch": 0.235344, + "grad_norm": 0.84765625, + "learning_rate": 7.708387096774195e-05, + "loss": 0.1757, + "step": 14709 + }, + { + "epoch": 0.23536, + "grad_norm": 0.828125, + "learning_rate": 7.708225806451613e-05, + "loss": 0.1693, + "step": 14710 + }, + { + "epoch": 0.235376, + "grad_norm": 0.53515625, + "learning_rate": 7.708064516129032e-05, + "loss": 0.1391, + "step": 14711 + }, + { + "epoch": 0.235392, + "grad_norm": 0.61328125, + "learning_rate": 7.707903225806452e-05, + "loss": 0.1721, + "step": 14712 + }, + { + "epoch": 0.235408, + "grad_norm": 1.296875, + "learning_rate": 7.70774193548387e-05, + "loss": 0.1735, + "step": 14713 + }, + { + "epoch": 0.235424, + "grad_norm": 1.234375, + "learning_rate": 7.70758064516129e-05, + "loss": 0.1684, + "step": 14714 + }, + { + "epoch": 0.23544, + "grad_norm": 1.0234375, + "learning_rate": 7.70741935483871e-05, + "loss": 0.1554, + "step": 14715 + }, + { + "epoch": 0.235456, + "grad_norm": 1.0234375, + "learning_rate": 7.70725806451613e-05, + "loss": 0.2033, + "step": 14716 + }, + { + "epoch": 0.235472, + "grad_norm": 0.55859375, + "learning_rate": 7.707096774193549e-05, + "loss": 0.1515, + "step": 14717 + }, + { + "epoch": 0.235488, + "grad_norm": 0.78125, + "learning_rate": 7.706935483870969e-05, + "loss": 0.1648, + "step": 14718 + }, + { + "epoch": 0.235504, + "grad_norm": 1.1015625, + "learning_rate": 7.706774193548387e-05, + "loss": 0.1385, + "step": 14719 + }, + { + "epoch": 0.23552, + "grad_norm": 0.73046875, + "learning_rate": 7.706612903225807e-05, + "loss": 0.1809, + "step": 14720 + }, + { + "epoch": 0.235536, + "grad_norm": 0.462890625, + "learning_rate": 7.706451612903226e-05, + "loss": 0.1535, + "step": 14721 + }, + { + "epoch": 0.235552, + "grad_norm": 0.59375, + "learning_rate": 7.706290322580646e-05, + "loss": 0.1474, + "step": 14722 + }, + { + "epoch": 0.235568, + "grad_norm": 1.3828125, + "learning_rate": 7.706129032258065e-05, + "loss": 0.2057, + "step": 14723 + }, + { + "epoch": 0.235584, + "grad_norm": 0.796875, + "learning_rate": 7.705967741935485e-05, + "loss": 0.1887, + "step": 14724 + }, + { + "epoch": 0.2356, + "grad_norm": 0.84765625, + "learning_rate": 7.705806451612903e-05, + "loss": 0.1889, + "step": 14725 + }, + { + "epoch": 0.235616, + "grad_norm": 0.6328125, + "learning_rate": 7.705645161290322e-05, + "loss": 0.1697, + "step": 14726 + }, + { + "epoch": 0.235632, + "grad_norm": 1.265625, + "learning_rate": 7.705483870967742e-05, + "loss": 0.2175, + "step": 14727 + }, + { + "epoch": 0.235648, + "grad_norm": 0.84765625, + "learning_rate": 7.705322580645162e-05, + "loss": 0.1848, + "step": 14728 + }, + { + "epoch": 0.235664, + "grad_norm": 0.6171875, + "learning_rate": 7.705161290322582e-05, + "loss": 0.1813, + "step": 14729 + }, + { + "epoch": 0.23568, + "grad_norm": 0.75, + "learning_rate": 7.705e-05, + "loss": 0.1894, + "step": 14730 + }, + { + "epoch": 0.235696, + "grad_norm": 0.7890625, + "learning_rate": 7.70483870967742e-05, + "loss": 0.1982, + "step": 14731 + }, + { + "epoch": 0.235712, + "grad_norm": 0.75, + "learning_rate": 7.704677419354839e-05, + "loss": 0.1871, + "step": 14732 + }, + { + "epoch": 0.235728, + "grad_norm": 0.5703125, + "learning_rate": 7.704516129032259e-05, + "loss": 0.1384, + "step": 14733 + }, + { + "epoch": 0.235744, + "grad_norm": 0.671875, + "learning_rate": 7.704354838709677e-05, + "loss": 0.1712, + "step": 14734 + }, + { + "epoch": 0.23576, + "grad_norm": 0.515625, + "learning_rate": 7.704193548387097e-05, + "loss": 0.1505, + "step": 14735 + }, + { + "epoch": 0.235776, + "grad_norm": 0.9375, + "learning_rate": 7.704032258064516e-05, + "loss": 0.2143, + "step": 14736 + }, + { + "epoch": 0.235792, + "grad_norm": 0.83203125, + "learning_rate": 7.703870967741936e-05, + "loss": 0.1759, + "step": 14737 + }, + { + "epoch": 0.235808, + "grad_norm": 0.80859375, + "learning_rate": 7.703709677419355e-05, + "loss": 0.1659, + "step": 14738 + }, + { + "epoch": 0.235824, + "grad_norm": 0.76171875, + "learning_rate": 7.703548387096775e-05, + "loss": 0.1855, + "step": 14739 + }, + { + "epoch": 0.23584, + "grad_norm": 0.490234375, + "learning_rate": 7.703387096774194e-05, + "loss": 0.1665, + "step": 14740 + }, + { + "epoch": 0.235856, + "grad_norm": 0.765625, + "learning_rate": 7.703225806451613e-05, + "loss": 0.1864, + "step": 14741 + }, + { + "epoch": 0.235872, + "grad_norm": 1.2265625, + "learning_rate": 7.703064516129033e-05, + "loss": 0.1848, + "step": 14742 + }, + { + "epoch": 0.235888, + "grad_norm": 0.51171875, + "learning_rate": 7.702903225806452e-05, + "loss": 0.1618, + "step": 14743 + }, + { + "epoch": 0.235904, + "grad_norm": 0.94140625, + "learning_rate": 7.702741935483872e-05, + "loss": 0.1868, + "step": 14744 + }, + { + "epoch": 0.23592, + "grad_norm": 0.66015625, + "learning_rate": 7.70258064516129e-05, + "loss": 0.176, + "step": 14745 + }, + { + "epoch": 0.235936, + "grad_norm": 1.2421875, + "learning_rate": 7.70241935483871e-05, + "loss": 0.1566, + "step": 14746 + }, + { + "epoch": 0.235952, + "grad_norm": 0.703125, + "learning_rate": 7.702258064516129e-05, + "loss": 0.1675, + "step": 14747 + }, + { + "epoch": 0.235968, + "grad_norm": 0.57421875, + "learning_rate": 7.702096774193549e-05, + "loss": 0.2197, + "step": 14748 + }, + { + "epoch": 0.235984, + "grad_norm": 0.640625, + "learning_rate": 7.701935483870967e-05, + "loss": 0.1695, + "step": 14749 + }, + { + "epoch": 0.236, + "grad_norm": 0.53125, + "learning_rate": 7.701774193548387e-05, + "loss": 0.1549, + "step": 14750 + }, + { + "epoch": 0.236016, + "grad_norm": 0.7421875, + "learning_rate": 7.701612903225807e-05, + "loss": 0.1902, + "step": 14751 + }, + { + "epoch": 0.236032, + "grad_norm": 1.03125, + "learning_rate": 7.701451612903227e-05, + "loss": 0.1828, + "step": 14752 + }, + { + "epoch": 0.236048, + "grad_norm": 1.203125, + "learning_rate": 7.701290322580646e-05, + "loss": 0.1586, + "step": 14753 + }, + { + "epoch": 0.236064, + "grad_norm": 0.6953125, + "learning_rate": 7.701129032258066e-05, + "loss": 0.1627, + "step": 14754 + }, + { + "epoch": 0.23608, + "grad_norm": 0.62109375, + "learning_rate": 7.700967741935484e-05, + "loss": 0.1564, + "step": 14755 + }, + { + "epoch": 0.236096, + "grad_norm": 0.6796875, + "learning_rate": 7.700806451612904e-05, + "loss": 0.1705, + "step": 14756 + }, + { + "epoch": 0.236112, + "grad_norm": 0.65234375, + "learning_rate": 7.700645161290323e-05, + "loss": 0.1701, + "step": 14757 + }, + { + "epoch": 0.236128, + "grad_norm": 1.8671875, + "learning_rate": 7.700483870967742e-05, + "loss": 0.1633, + "step": 14758 + }, + { + "epoch": 0.236144, + "grad_norm": 0.7578125, + "learning_rate": 7.700322580645162e-05, + "loss": 0.1997, + "step": 14759 + }, + { + "epoch": 0.23616, + "grad_norm": 0.5703125, + "learning_rate": 7.70016129032258e-05, + "loss": 0.1816, + "step": 14760 + }, + { + "epoch": 0.236176, + "grad_norm": 0.92578125, + "learning_rate": 7.7e-05, + "loss": 0.1662, + "step": 14761 + }, + { + "epoch": 0.236192, + "grad_norm": 0.9453125, + "learning_rate": 7.699838709677419e-05, + "loss": 0.154, + "step": 14762 + }, + { + "epoch": 0.236208, + "grad_norm": 0.83984375, + "learning_rate": 7.699677419354839e-05, + "loss": 0.1564, + "step": 14763 + }, + { + "epoch": 0.236224, + "grad_norm": 0.53515625, + "learning_rate": 7.699516129032259e-05, + "loss": 0.1876, + "step": 14764 + }, + { + "epoch": 0.23624, + "grad_norm": 0.9375, + "learning_rate": 7.699354838709679e-05, + "loss": 0.1602, + "step": 14765 + }, + { + "epoch": 0.236256, + "grad_norm": 0.60546875, + "learning_rate": 7.699193548387097e-05, + "loss": 0.1628, + "step": 14766 + }, + { + "epoch": 0.236272, + "grad_norm": 0.60546875, + "learning_rate": 7.699032258064517e-05, + "loss": 0.1659, + "step": 14767 + }, + { + "epoch": 0.236288, + "grad_norm": 0.80078125, + "learning_rate": 7.698870967741936e-05, + "loss": 0.1948, + "step": 14768 + }, + { + "epoch": 0.236304, + "grad_norm": 1.2265625, + "learning_rate": 7.698709677419356e-05, + "loss": 0.162, + "step": 14769 + }, + { + "epoch": 0.23632, + "grad_norm": 0.703125, + "learning_rate": 7.698548387096774e-05, + "loss": 0.219, + "step": 14770 + }, + { + "epoch": 0.236336, + "grad_norm": 1.1640625, + "learning_rate": 7.698387096774194e-05, + "loss": 0.1707, + "step": 14771 + }, + { + "epoch": 0.236352, + "grad_norm": 0.80859375, + "learning_rate": 7.698225806451613e-05, + "loss": 0.1905, + "step": 14772 + }, + { + "epoch": 0.236368, + "grad_norm": 0.98828125, + "learning_rate": 7.698064516129032e-05, + "loss": 0.1869, + "step": 14773 + }, + { + "epoch": 0.236384, + "grad_norm": 0.75, + "learning_rate": 7.697903225806452e-05, + "loss": 0.1595, + "step": 14774 + }, + { + "epoch": 0.2364, + "grad_norm": 0.90234375, + "learning_rate": 7.697741935483871e-05, + "loss": 0.1765, + "step": 14775 + }, + { + "epoch": 0.236416, + "grad_norm": 0.75390625, + "learning_rate": 7.697580645161291e-05, + "loss": 0.1298, + "step": 14776 + }, + { + "epoch": 0.236432, + "grad_norm": 0.94140625, + "learning_rate": 7.69741935483871e-05, + "loss": 0.1925, + "step": 14777 + }, + { + "epoch": 0.236448, + "grad_norm": 1.15625, + "learning_rate": 7.69725806451613e-05, + "loss": 0.1644, + "step": 14778 + }, + { + "epoch": 0.236464, + "grad_norm": 0.78125, + "learning_rate": 7.697096774193549e-05, + "loss": 0.1728, + "step": 14779 + }, + { + "epoch": 0.23648, + "grad_norm": 1.2265625, + "learning_rate": 7.696935483870969e-05, + "loss": 0.1497, + "step": 14780 + }, + { + "epoch": 0.236496, + "grad_norm": 0.58984375, + "learning_rate": 7.696774193548387e-05, + "loss": 0.1492, + "step": 14781 + }, + { + "epoch": 0.236512, + "grad_norm": 1.1484375, + "learning_rate": 7.696612903225807e-05, + "loss": 0.1895, + "step": 14782 + }, + { + "epoch": 0.236528, + "grad_norm": 0.72265625, + "learning_rate": 7.696451612903226e-05, + "loss": 0.1605, + "step": 14783 + }, + { + "epoch": 0.236544, + "grad_norm": 0.8359375, + "learning_rate": 7.696290322580646e-05, + "loss": 0.1898, + "step": 14784 + }, + { + "epoch": 0.23656, + "grad_norm": 0.77734375, + "learning_rate": 7.696129032258064e-05, + "loss": 0.1498, + "step": 14785 + }, + { + "epoch": 0.236576, + "grad_norm": 0.7265625, + "learning_rate": 7.695967741935484e-05, + "loss": 0.1509, + "step": 14786 + }, + { + "epoch": 0.236592, + "grad_norm": 0.859375, + "learning_rate": 7.695806451612903e-05, + "loss": 0.1988, + "step": 14787 + }, + { + "epoch": 0.236608, + "grad_norm": 0.6953125, + "learning_rate": 7.695645161290323e-05, + "loss": 0.1858, + "step": 14788 + }, + { + "epoch": 0.236624, + "grad_norm": 1.03125, + "learning_rate": 7.695483870967743e-05, + "loss": 0.176, + "step": 14789 + }, + { + "epoch": 0.23664, + "grad_norm": 0.69921875, + "learning_rate": 7.695322580645161e-05, + "loss": 0.1528, + "step": 14790 + }, + { + "epoch": 0.236656, + "grad_norm": 0.73828125, + "learning_rate": 7.695161290322581e-05, + "loss": 0.1809, + "step": 14791 + }, + { + "epoch": 0.236672, + "grad_norm": 1.359375, + "learning_rate": 7.695e-05, + "loss": 0.1917, + "step": 14792 + }, + { + "epoch": 0.236688, + "grad_norm": 0.5703125, + "learning_rate": 7.69483870967742e-05, + "loss": 0.1333, + "step": 14793 + }, + { + "epoch": 0.236704, + "grad_norm": 0.84375, + "learning_rate": 7.694677419354839e-05, + "loss": 0.181, + "step": 14794 + }, + { + "epoch": 0.23672, + "grad_norm": 0.734375, + "learning_rate": 7.694516129032259e-05, + "loss": 0.1675, + "step": 14795 + }, + { + "epoch": 0.236736, + "grad_norm": 0.78515625, + "learning_rate": 7.694354838709677e-05, + "loss": 0.1735, + "step": 14796 + }, + { + "epoch": 0.236752, + "grad_norm": 0.8984375, + "learning_rate": 7.694193548387097e-05, + "loss": 0.1625, + "step": 14797 + }, + { + "epoch": 0.236768, + "grad_norm": 1.140625, + "learning_rate": 7.694032258064516e-05, + "loss": 0.2483, + "step": 14798 + }, + { + "epoch": 0.236784, + "grad_norm": 0.65234375, + "learning_rate": 7.693870967741936e-05, + "loss": 0.1423, + "step": 14799 + }, + { + "epoch": 0.2368, + "grad_norm": 0.75, + "learning_rate": 7.693709677419356e-05, + "loss": 0.1822, + "step": 14800 + }, + { + "epoch": 0.236816, + "grad_norm": 0.57421875, + "learning_rate": 7.693548387096776e-05, + "loss": 0.1538, + "step": 14801 + }, + { + "epoch": 0.236832, + "grad_norm": 0.75, + "learning_rate": 7.693387096774194e-05, + "loss": 0.1928, + "step": 14802 + }, + { + "epoch": 0.236848, + "grad_norm": 0.73828125, + "learning_rate": 7.693225806451613e-05, + "loss": 0.1987, + "step": 14803 + }, + { + "epoch": 0.236864, + "grad_norm": 0.66015625, + "learning_rate": 7.693064516129033e-05, + "loss": 0.1815, + "step": 14804 + }, + { + "epoch": 0.23688, + "grad_norm": 0.78125, + "learning_rate": 7.692903225806451e-05, + "loss": 0.1635, + "step": 14805 + }, + { + "epoch": 0.236896, + "grad_norm": 1.7421875, + "learning_rate": 7.692741935483871e-05, + "loss": 0.1837, + "step": 14806 + }, + { + "epoch": 0.236912, + "grad_norm": 0.84765625, + "learning_rate": 7.69258064516129e-05, + "loss": 0.1633, + "step": 14807 + }, + { + "epoch": 0.236928, + "grad_norm": 1.046875, + "learning_rate": 7.69241935483871e-05, + "loss": 0.1931, + "step": 14808 + }, + { + "epoch": 0.236944, + "grad_norm": 0.59375, + "learning_rate": 7.692258064516129e-05, + "loss": 0.1486, + "step": 14809 + }, + { + "epoch": 0.23696, + "grad_norm": 0.76171875, + "learning_rate": 7.692096774193549e-05, + "loss": 0.2121, + "step": 14810 + }, + { + "epoch": 0.236976, + "grad_norm": 0.75, + "learning_rate": 7.691935483870968e-05, + "loss": 0.1883, + "step": 14811 + }, + { + "epoch": 0.236992, + "grad_norm": 0.875, + "learning_rate": 7.691774193548388e-05, + "loss": 0.163, + "step": 14812 + }, + { + "epoch": 0.237008, + "grad_norm": 0.79296875, + "learning_rate": 7.691612903225807e-05, + "loss": 0.1864, + "step": 14813 + }, + { + "epoch": 0.237024, + "grad_norm": 0.98046875, + "learning_rate": 7.691451612903227e-05, + "loss": 0.1749, + "step": 14814 + }, + { + "epoch": 0.23704, + "grad_norm": 0.82421875, + "learning_rate": 7.691290322580646e-05, + "loss": 0.1479, + "step": 14815 + }, + { + "epoch": 0.237056, + "grad_norm": 0.78515625, + "learning_rate": 7.691129032258066e-05, + "loss": 0.2004, + "step": 14816 + }, + { + "epoch": 0.237072, + "grad_norm": 0.8984375, + "learning_rate": 7.690967741935484e-05, + "loss": 0.1919, + "step": 14817 + }, + { + "epoch": 0.237088, + "grad_norm": 0.75390625, + "learning_rate": 7.690806451612904e-05, + "loss": 0.2015, + "step": 14818 + }, + { + "epoch": 0.237104, + "grad_norm": 0.87109375, + "learning_rate": 7.690645161290323e-05, + "loss": 0.1721, + "step": 14819 + }, + { + "epoch": 0.23712, + "grad_norm": 0.91015625, + "learning_rate": 7.690483870967741e-05, + "loss": 0.1445, + "step": 14820 + }, + { + "epoch": 0.237136, + "grad_norm": 0.87109375, + "learning_rate": 7.690322580645161e-05, + "loss": 0.1612, + "step": 14821 + }, + { + "epoch": 0.237152, + "grad_norm": 0.95703125, + "learning_rate": 7.69016129032258e-05, + "loss": 0.2096, + "step": 14822 + }, + { + "epoch": 0.237168, + "grad_norm": 0.64453125, + "learning_rate": 7.69e-05, + "loss": 0.1314, + "step": 14823 + }, + { + "epoch": 0.237184, + "grad_norm": 0.62109375, + "learning_rate": 7.68983870967742e-05, + "loss": 0.1292, + "step": 14824 + }, + { + "epoch": 0.2372, + "grad_norm": 0.96875, + "learning_rate": 7.68967741935484e-05, + "loss": 0.1682, + "step": 14825 + }, + { + "epoch": 0.237216, + "grad_norm": 0.765625, + "learning_rate": 7.689516129032258e-05, + "loss": 0.1977, + "step": 14826 + }, + { + "epoch": 0.237232, + "grad_norm": 1.015625, + "learning_rate": 7.689354838709678e-05, + "loss": 0.2006, + "step": 14827 + }, + { + "epoch": 0.237248, + "grad_norm": 0.8984375, + "learning_rate": 7.689193548387097e-05, + "loss": 0.2095, + "step": 14828 + }, + { + "epoch": 0.237264, + "grad_norm": 0.7265625, + "learning_rate": 7.689032258064517e-05, + "loss": 0.1672, + "step": 14829 + }, + { + "epoch": 0.23728, + "grad_norm": 0.84765625, + "learning_rate": 7.688870967741936e-05, + "loss": 0.2265, + "step": 14830 + }, + { + "epoch": 0.237296, + "grad_norm": 0.92578125, + "learning_rate": 7.688709677419356e-05, + "loss": 0.1714, + "step": 14831 + }, + { + "epoch": 0.237312, + "grad_norm": 0.6953125, + "learning_rate": 7.688548387096774e-05, + "loss": 0.1558, + "step": 14832 + }, + { + "epoch": 0.237328, + "grad_norm": 0.90234375, + "learning_rate": 7.688387096774194e-05, + "loss": 0.1621, + "step": 14833 + }, + { + "epoch": 0.237344, + "grad_norm": 0.87109375, + "learning_rate": 7.688225806451613e-05, + "loss": 0.143, + "step": 14834 + }, + { + "epoch": 0.23736, + "grad_norm": 0.7578125, + "learning_rate": 7.688064516129033e-05, + "loss": 0.216, + "step": 14835 + }, + { + "epoch": 0.237376, + "grad_norm": 0.74609375, + "learning_rate": 7.687903225806453e-05, + "loss": 0.1668, + "step": 14836 + }, + { + "epoch": 0.237392, + "grad_norm": 0.74609375, + "learning_rate": 7.687741935483871e-05, + "loss": 0.1932, + "step": 14837 + }, + { + "epoch": 0.237408, + "grad_norm": 0.72265625, + "learning_rate": 7.687580645161291e-05, + "loss": 0.1196, + "step": 14838 + }, + { + "epoch": 0.237424, + "grad_norm": 1.0078125, + "learning_rate": 7.68741935483871e-05, + "loss": 0.2015, + "step": 14839 + }, + { + "epoch": 0.23744, + "grad_norm": 0.5859375, + "learning_rate": 7.68725806451613e-05, + "loss": 0.1805, + "step": 14840 + }, + { + "epoch": 0.237456, + "grad_norm": 0.75390625, + "learning_rate": 7.687096774193548e-05, + "loss": 0.1669, + "step": 14841 + }, + { + "epoch": 0.237472, + "grad_norm": 0.796875, + "learning_rate": 7.686935483870968e-05, + "loss": 0.1539, + "step": 14842 + }, + { + "epoch": 0.237488, + "grad_norm": 0.890625, + "learning_rate": 7.686774193548387e-05, + "loss": 0.147, + "step": 14843 + }, + { + "epoch": 0.237504, + "grad_norm": 0.8359375, + "learning_rate": 7.686612903225807e-05, + "loss": 0.152, + "step": 14844 + }, + { + "epoch": 0.23752, + "grad_norm": 0.703125, + "learning_rate": 7.686451612903226e-05, + "loss": 0.1678, + "step": 14845 + }, + { + "epoch": 0.237536, + "grad_norm": 0.70703125, + "learning_rate": 7.686290322580645e-05, + "loss": 0.1557, + "step": 14846 + }, + { + "epoch": 0.237552, + "grad_norm": 0.56640625, + "learning_rate": 7.686129032258065e-05, + "loss": 0.1697, + "step": 14847 + }, + { + "epoch": 0.237568, + "grad_norm": 0.6796875, + "learning_rate": 7.685967741935485e-05, + "loss": 0.1683, + "step": 14848 + }, + { + "epoch": 0.237584, + "grad_norm": 0.7109375, + "learning_rate": 7.685806451612904e-05, + "loss": 0.1742, + "step": 14849 + }, + { + "epoch": 0.2376, + "grad_norm": 0.55078125, + "learning_rate": 7.685645161290323e-05, + "loss": 0.1357, + "step": 14850 + }, + { + "epoch": 0.237616, + "grad_norm": 0.7734375, + "learning_rate": 7.685483870967743e-05, + "loss": 0.16, + "step": 14851 + }, + { + "epoch": 0.237632, + "grad_norm": 0.59375, + "learning_rate": 7.685322580645161e-05, + "loss": 0.1442, + "step": 14852 + }, + { + "epoch": 0.237648, + "grad_norm": 0.74609375, + "learning_rate": 7.685161290322581e-05, + "loss": 0.1637, + "step": 14853 + }, + { + "epoch": 0.237664, + "grad_norm": 0.59375, + "learning_rate": 7.685e-05, + "loss": 0.1722, + "step": 14854 + }, + { + "epoch": 0.23768, + "grad_norm": 0.59765625, + "learning_rate": 7.68483870967742e-05, + "loss": 0.1708, + "step": 14855 + }, + { + "epoch": 0.237696, + "grad_norm": 0.796875, + "learning_rate": 7.684677419354838e-05, + "loss": 0.1722, + "step": 14856 + }, + { + "epoch": 0.237712, + "grad_norm": 1.0, + "learning_rate": 7.684516129032258e-05, + "loss": 0.1976, + "step": 14857 + }, + { + "epoch": 0.237728, + "grad_norm": 1.4453125, + "learning_rate": 7.684354838709677e-05, + "loss": 0.1795, + "step": 14858 + }, + { + "epoch": 0.237744, + "grad_norm": 0.79296875, + "learning_rate": 7.684193548387097e-05, + "loss": 0.1888, + "step": 14859 + }, + { + "epoch": 0.23776, + "grad_norm": 0.6171875, + "learning_rate": 7.684032258064517e-05, + "loss": 0.1499, + "step": 14860 + }, + { + "epoch": 0.237776, + "grad_norm": 0.890625, + "learning_rate": 7.683870967741937e-05, + "loss": 0.1765, + "step": 14861 + }, + { + "epoch": 0.237792, + "grad_norm": 0.94140625, + "learning_rate": 7.683709677419355e-05, + "loss": 0.155, + "step": 14862 + }, + { + "epoch": 0.237808, + "grad_norm": 1.2734375, + "learning_rate": 7.683548387096775e-05, + "loss": 0.2298, + "step": 14863 + }, + { + "epoch": 0.237824, + "grad_norm": 0.7265625, + "learning_rate": 7.683387096774194e-05, + "loss": 0.1694, + "step": 14864 + }, + { + "epoch": 0.23784, + "grad_norm": 1.0, + "learning_rate": 7.683225806451613e-05, + "loss": 0.1654, + "step": 14865 + }, + { + "epoch": 0.237856, + "grad_norm": 0.80078125, + "learning_rate": 7.683064516129033e-05, + "loss": 0.2177, + "step": 14866 + }, + { + "epoch": 0.237872, + "grad_norm": 0.4453125, + "learning_rate": 7.682903225806451e-05, + "loss": 0.1297, + "step": 14867 + }, + { + "epoch": 0.237888, + "grad_norm": 0.93359375, + "learning_rate": 7.682741935483871e-05, + "loss": 0.2323, + "step": 14868 + }, + { + "epoch": 0.237904, + "grad_norm": 1.078125, + "learning_rate": 7.68258064516129e-05, + "loss": 0.202, + "step": 14869 + }, + { + "epoch": 0.23792, + "grad_norm": 0.78515625, + "learning_rate": 7.68241935483871e-05, + "loss": 0.2022, + "step": 14870 + }, + { + "epoch": 0.237936, + "grad_norm": 0.8828125, + "learning_rate": 7.68225806451613e-05, + "loss": 0.2234, + "step": 14871 + }, + { + "epoch": 0.237952, + "grad_norm": 1.28125, + "learning_rate": 7.68209677419355e-05, + "loss": 0.1678, + "step": 14872 + }, + { + "epoch": 0.237968, + "grad_norm": 1.0, + "learning_rate": 7.681935483870968e-05, + "loss": 0.2161, + "step": 14873 + }, + { + "epoch": 0.237984, + "grad_norm": 1.0625, + "learning_rate": 7.681774193548388e-05, + "loss": 0.186, + "step": 14874 + }, + { + "epoch": 0.238, + "grad_norm": 0.65625, + "learning_rate": 7.681612903225807e-05, + "loss": 0.1406, + "step": 14875 + }, + { + "epoch": 0.238016, + "grad_norm": 0.89453125, + "learning_rate": 7.681451612903227e-05, + "loss": 0.1958, + "step": 14876 + }, + { + "epoch": 0.238032, + "grad_norm": 0.71875, + "learning_rate": 7.681290322580645e-05, + "loss": 0.1776, + "step": 14877 + }, + { + "epoch": 0.238048, + "grad_norm": 1.078125, + "learning_rate": 7.681129032258065e-05, + "loss": 0.1848, + "step": 14878 + }, + { + "epoch": 0.238064, + "grad_norm": 0.9296875, + "learning_rate": 7.680967741935484e-05, + "loss": 0.1438, + "step": 14879 + }, + { + "epoch": 0.23808, + "grad_norm": 1.328125, + "learning_rate": 7.680806451612904e-05, + "loss": 0.157, + "step": 14880 + }, + { + "epoch": 0.238096, + "grad_norm": 0.73046875, + "learning_rate": 7.680645161290323e-05, + "loss": 0.1724, + "step": 14881 + }, + { + "epoch": 0.238112, + "grad_norm": 0.80078125, + "learning_rate": 7.680483870967742e-05, + "loss": 0.1775, + "step": 14882 + }, + { + "epoch": 0.238128, + "grad_norm": 0.91015625, + "learning_rate": 7.680322580645161e-05, + "loss": 0.2174, + "step": 14883 + }, + { + "epoch": 0.238144, + "grad_norm": 0.828125, + "learning_rate": 7.680161290322581e-05, + "loss": 0.1553, + "step": 14884 + }, + { + "epoch": 0.23816, + "grad_norm": 0.65234375, + "learning_rate": 7.680000000000001e-05, + "loss": 0.1651, + "step": 14885 + }, + { + "epoch": 0.238176, + "grad_norm": 0.7734375, + "learning_rate": 7.67983870967742e-05, + "loss": 0.2167, + "step": 14886 + }, + { + "epoch": 0.238192, + "grad_norm": 0.6953125, + "learning_rate": 7.67967741935484e-05, + "loss": 0.1669, + "step": 14887 + }, + { + "epoch": 0.238208, + "grad_norm": 0.9296875, + "learning_rate": 7.679516129032258e-05, + "loss": 0.2037, + "step": 14888 + }, + { + "epoch": 0.238224, + "grad_norm": 0.66796875, + "learning_rate": 7.679354838709678e-05, + "loss": 0.1729, + "step": 14889 + }, + { + "epoch": 0.23824, + "grad_norm": 1.2890625, + "learning_rate": 7.679193548387097e-05, + "loss": 0.1914, + "step": 14890 + }, + { + "epoch": 0.238256, + "grad_norm": 0.98046875, + "learning_rate": 7.679032258064517e-05, + "loss": 0.1706, + "step": 14891 + }, + { + "epoch": 0.238272, + "grad_norm": 0.7109375, + "learning_rate": 7.678870967741935e-05, + "loss": 0.1972, + "step": 14892 + }, + { + "epoch": 0.238288, + "grad_norm": 0.66015625, + "learning_rate": 7.678709677419355e-05, + "loss": 0.1646, + "step": 14893 + }, + { + "epoch": 0.238304, + "grad_norm": 0.9609375, + "learning_rate": 7.678548387096774e-05, + "loss": 0.1781, + "step": 14894 + }, + { + "epoch": 0.23832, + "grad_norm": 1.0859375, + "learning_rate": 7.678387096774194e-05, + "loss": 0.1567, + "step": 14895 + }, + { + "epoch": 0.238336, + "grad_norm": 0.78515625, + "learning_rate": 7.678225806451614e-05, + "loss": 0.1776, + "step": 14896 + }, + { + "epoch": 0.238352, + "grad_norm": 0.5625, + "learning_rate": 7.678064516129032e-05, + "loss": 0.1928, + "step": 14897 + }, + { + "epoch": 0.238368, + "grad_norm": 0.87890625, + "learning_rate": 7.677903225806452e-05, + "loss": 0.2126, + "step": 14898 + }, + { + "epoch": 0.238384, + "grad_norm": 0.8046875, + "learning_rate": 7.677741935483871e-05, + "loss": 0.1798, + "step": 14899 + }, + { + "epoch": 0.2384, + "grad_norm": 1.2109375, + "learning_rate": 7.677580645161291e-05, + "loss": 0.2121, + "step": 14900 + }, + { + "epoch": 0.238416, + "grad_norm": 0.640625, + "learning_rate": 7.67741935483871e-05, + "loss": 0.1772, + "step": 14901 + }, + { + "epoch": 0.238432, + "grad_norm": 0.9375, + "learning_rate": 7.67725806451613e-05, + "loss": 0.169, + "step": 14902 + }, + { + "epoch": 0.238448, + "grad_norm": 0.84765625, + "learning_rate": 7.677096774193548e-05, + "loss": 0.1873, + "step": 14903 + }, + { + "epoch": 0.238464, + "grad_norm": 0.9140625, + "learning_rate": 7.676935483870968e-05, + "loss": 0.1544, + "step": 14904 + }, + { + "epoch": 0.23848, + "grad_norm": 0.59765625, + "learning_rate": 7.676774193548387e-05, + "loss": 0.176, + "step": 14905 + }, + { + "epoch": 0.238496, + "grad_norm": 0.64453125, + "learning_rate": 7.676612903225807e-05, + "loss": 0.1357, + "step": 14906 + }, + { + "epoch": 0.238512, + "grad_norm": 0.6953125, + "learning_rate": 7.676451612903227e-05, + "loss": 0.1482, + "step": 14907 + }, + { + "epoch": 0.238528, + "grad_norm": 0.64453125, + "learning_rate": 7.676290322580647e-05, + "loss": 0.1732, + "step": 14908 + }, + { + "epoch": 0.238544, + "grad_norm": 0.89453125, + "learning_rate": 7.676129032258065e-05, + "loss": 0.1969, + "step": 14909 + }, + { + "epoch": 0.23856, + "grad_norm": 0.80859375, + "learning_rate": 7.675967741935485e-05, + "loss": 0.1558, + "step": 14910 + }, + { + "epoch": 0.238576, + "grad_norm": 0.86328125, + "learning_rate": 7.675806451612904e-05, + "loss": 0.2007, + "step": 14911 + }, + { + "epoch": 0.238592, + "grad_norm": 0.62890625, + "learning_rate": 7.675645161290322e-05, + "loss": 0.1671, + "step": 14912 + }, + { + "epoch": 0.238608, + "grad_norm": 1.015625, + "learning_rate": 7.675483870967742e-05, + "loss": 0.2014, + "step": 14913 + }, + { + "epoch": 0.238624, + "grad_norm": 0.91796875, + "learning_rate": 7.675322580645161e-05, + "loss": 0.1643, + "step": 14914 + }, + { + "epoch": 0.23864, + "grad_norm": 0.8203125, + "learning_rate": 7.675161290322581e-05, + "loss": 0.1873, + "step": 14915 + }, + { + "epoch": 0.238656, + "grad_norm": 0.84765625, + "learning_rate": 7.675e-05, + "loss": 0.2037, + "step": 14916 + }, + { + "epoch": 0.238672, + "grad_norm": 0.50390625, + "learning_rate": 7.67483870967742e-05, + "loss": 0.1497, + "step": 14917 + }, + { + "epoch": 0.238688, + "grad_norm": 0.6484375, + "learning_rate": 7.674677419354838e-05, + "loss": 0.1612, + "step": 14918 + }, + { + "epoch": 0.238704, + "grad_norm": 0.671875, + "learning_rate": 7.674516129032258e-05, + "loss": 0.1891, + "step": 14919 + }, + { + "epoch": 0.23872, + "grad_norm": 0.73046875, + "learning_rate": 7.674354838709678e-05, + "loss": 0.1884, + "step": 14920 + }, + { + "epoch": 0.238736, + "grad_norm": 1.03125, + "learning_rate": 7.674193548387098e-05, + "loss": 0.1986, + "step": 14921 + }, + { + "epoch": 0.238752, + "grad_norm": 1.046875, + "learning_rate": 7.674032258064517e-05, + "loss": 0.2226, + "step": 14922 + }, + { + "epoch": 0.238768, + "grad_norm": 0.78515625, + "learning_rate": 7.673870967741937e-05, + "loss": 0.1585, + "step": 14923 + }, + { + "epoch": 0.238784, + "grad_norm": 1.4765625, + "learning_rate": 7.673709677419355e-05, + "loss": 0.1919, + "step": 14924 + }, + { + "epoch": 0.2388, + "grad_norm": 0.75390625, + "learning_rate": 7.673548387096775e-05, + "loss": 0.173, + "step": 14925 + }, + { + "epoch": 0.238816, + "grad_norm": 0.6171875, + "learning_rate": 7.673387096774194e-05, + "loss": 0.1812, + "step": 14926 + }, + { + "epoch": 0.238832, + "grad_norm": 1.4921875, + "learning_rate": 7.673225806451614e-05, + "loss": 0.2004, + "step": 14927 + }, + { + "epoch": 0.238848, + "grad_norm": 0.59765625, + "learning_rate": 7.673064516129032e-05, + "loss": 0.146, + "step": 14928 + }, + { + "epoch": 0.238864, + "grad_norm": 0.98828125, + "learning_rate": 7.672903225806451e-05, + "loss": 0.1615, + "step": 14929 + }, + { + "epoch": 0.23888, + "grad_norm": 0.84765625, + "learning_rate": 7.672741935483871e-05, + "loss": 0.1781, + "step": 14930 + }, + { + "epoch": 0.238896, + "grad_norm": 0.75390625, + "learning_rate": 7.672580645161291e-05, + "loss": 0.1812, + "step": 14931 + }, + { + "epoch": 0.238912, + "grad_norm": 0.81640625, + "learning_rate": 7.672419354838711e-05, + "loss": 0.2127, + "step": 14932 + }, + { + "epoch": 0.238928, + "grad_norm": 0.578125, + "learning_rate": 7.67225806451613e-05, + "loss": 0.1572, + "step": 14933 + }, + { + "epoch": 0.238944, + "grad_norm": 0.8359375, + "learning_rate": 7.67209677419355e-05, + "loss": 0.1738, + "step": 14934 + }, + { + "epoch": 0.23896, + "grad_norm": 0.765625, + "learning_rate": 7.671935483870968e-05, + "loss": 0.1578, + "step": 14935 + }, + { + "epoch": 0.238976, + "grad_norm": 0.61328125, + "learning_rate": 7.671774193548388e-05, + "loss": 0.1529, + "step": 14936 + }, + { + "epoch": 0.238992, + "grad_norm": 0.70703125, + "learning_rate": 7.671612903225807e-05, + "loss": 0.178, + "step": 14937 + }, + { + "epoch": 0.239008, + "grad_norm": 0.671875, + "learning_rate": 7.671451612903227e-05, + "loss": 0.1853, + "step": 14938 + }, + { + "epoch": 0.239024, + "grad_norm": 0.46484375, + "learning_rate": 7.671290322580645e-05, + "loss": 0.1618, + "step": 14939 + }, + { + "epoch": 0.23904, + "grad_norm": 1.125, + "learning_rate": 7.671129032258065e-05, + "loss": 0.1929, + "step": 14940 + }, + { + "epoch": 0.239056, + "grad_norm": 0.87109375, + "learning_rate": 7.670967741935484e-05, + "loss": 0.139, + "step": 14941 + }, + { + "epoch": 0.239072, + "grad_norm": 0.77734375, + "learning_rate": 7.670806451612904e-05, + "loss": 0.1697, + "step": 14942 + }, + { + "epoch": 0.239088, + "grad_norm": 1.109375, + "learning_rate": 7.670645161290324e-05, + "loss": 0.1887, + "step": 14943 + }, + { + "epoch": 0.239104, + "grad_norm": 0.83203125, + "learning_rate": 7.670483870967742e-05, + "loss": 0.1466, + "step": 14944 + }, + { + "epoch": 0.23912, + "grad_norm": 0.75, + "learning_rate": 7.670322580645162e-05, + "loss": 0.1822, + "step": 14945 + }, + { + "epoch": 0.239136, + "grad_norm": 0.7109375, + "learning_rate": 7.670161290322581e-05, + "loss": 0.1403, + "step": 14946 + }, + { + "epoch": 0.239152, + "grad_norm": 0.53125, + "learning_rate": 7.670000000000001e-05, + "loss": 0.146, + "step": 14947 + }, + { + "epoch": 0.239168, + "grad_norm": 0.96875, + "learning_rate": 7.66983870967742e-05, + "loss": 0.1882, + "step": 14948 + }, + { + "epoch": 0.239184, + "grad_norm": 0.96484375, + "learning_rate": 7.66967741935484e-05, + "loss": 0.1526, + "step": 14949 + }, + { + "epoch": 0.2392, + "grad_norm": 1.234375, + "learning_rate": 7.669516129032258e-05, + "loss": 0.1735, + "step": 14950 + }, + { + "epoch": 0.239216, + "grad_norm": 0.7578125, + "learning_rate": 7.669354838709678e-05, + "loss": 0.216, + "step": 14951 + }, + { + "epoch": 0.239232, + "grad_norm": 0.71484375, + "learning_rate": 7.669193548387097e-05, + "loss": 0.1812, + "step": 14952 + }, + { + "epoch": 0.239248, + "grad_norm": 0.5859375, + "learning_rate": 7.669032258064516e-05, + "loss": 0.1487, + "step": 14953 + }, + { + "epoch": 0.239264, + "grad_norm": 0.65234375, + "learning_rate": 7.668870967741935e-05, + "loss": 0.1599, + "step": 14954 + }, + { + "epoch": 0.23928, + "grad_norm": 0.765625, + "learning_rate": 7.668709677419355e-05, + "loss": 0.1553, + "step": 14955 + }, + { + "epoch": 0.239296, + "grad_norm": 0.67578125, + "learning_rate": 7.668548387096775e-05, + "loss": 0.1615, + "step": 14956 + }, + { + "epoch": 0.239312, + "grad_norm": 1.359375, + "learning_rate": 7.668387096774195e-05, + "loss": 0.1918, + "step": 14957 + }, + { + "epoch": 0.239328, + "grad_norm": 0.57421875, + "learning_rate": 7.668225806451614e-05, + "loss": 0.1477, + "step": 14958 + }, + { + "epoch": 0.239344, + "grad_norm": 0.86328125, + "learning_rate": 7.668064516129032e-05, + "loss": 0.1637, + "step": 14959 + }, + { + "epoch": 0.23936, + "grad_norm": 0.58203125, + "learning_rate": 7.667903225806452e-05, + "loss": 0.1619, + "step": 14960 + }, + { + "epoch": 0.239376, + "grad_norm": 0.93359375, + "learning_rate": 7.667741935483871e-05, + "loss": 0.1738, + "step": 14961 + }, + { + "epoch": 0.239392, + "grad_norm": 1.28125, + "learning_rate": 7.667580645161291e-05, + "loss": 0.1565, + "step": 14962 + }, + { + "epoch": 0.239408, + "grad_norm": 1.125, + "learning_rate": 7.66741935483871e-05, + "loss": 0.2039, + "step": 14963 + }, + { + "epoch": 0.239424, + "grad_norm": 0.99609375, + "learning_rate": 7.667258064516129e-05, + "loss": 0.1609, + "step": 14964 + }, + { + "epoch": 0.23944, + "grad_norm": 0.74609375, + "learning_rate": 7.667096774193548e-05, + "loss": 0.2149, + "step": 14965 + }, + { + "epoch": 0.239456, + "grad_norm": 0.78515625, + "learning_rate": 7.666935483870968e-05, + "loss": 0.1521, + "step": 14966 + }, + { + "epoch": 0.239472, + "grad_norm": 0.8203125, + "learning_rate": 7.666774193548388e-05, + "loss": 0.1628, + "step": 14967 + }, + { + "epoch": 0.239488, + "grad_norm": 0.984375, + "learning_rate": 7.666612903225808e-05, + "loss": 0.1513, + "step": 14968 + }, + { + "epoch": 0.239504, + "grad_norm": 0.5546875, + "learning_rate": 7.666451612903226e-05, + "loss": 0.164, + "step": 14969 + }, + { + "epoch": 0.23952, + "grad_norm": 1.046875, + "learning_rate": 7.666290322580646e-05, + "loss": 0.1765, + "step": 14970 + }, + { + "epoch": 0.239536, + "grad_norm": 0.8671875, + "learning_rate": 7.666129032258065e-05, + "loss": 0.1915, + "step": 14971 + }, + { + "epoch": 0.239552, + "grad_norm": 0.859375, + "learning_rate": 7.665967741935485e-05, + "loss": 0.1991, + "step": 14972 + }, + { + "epoch": 0.239568, + "grad_norm": 0.88671875, + "learning_rate": 7.665806451612904e-05, + "loss": 0.1925, + "step": 14973 + }, + { + "epoch": 0.239584, + "grad_norm": 0.6953125, + "learning_rate": 7.665645161290322e-05, + "loss": 0.1744, + "step": 14974 + }, + { + "epoch": 0.2396, + "grad_norm": 0.5625, + "learning_rate": 7.665483870967742e-05, + "loss": 0.155, + "step": 14975 + }, + { + "epoch": 0.239616, + "grad_norm": 1.1171875, + "learning_rate": 7.665322580645161e-05, + "loss": 0.1716, + "step": 14976 + }, + { + "epoch": 0.239632, + "grad_norm": 0.7890625, + "learning_rate": 7.665161290322581e-05, + "loss": 0.1593, + "step": 14977 + }, + { + "epoch": 0.239648, + "grad_norm": 0.5390625, + "learning_rate": 7.664999999999999e-05, + "loss": 0.1324, + "step": 14978 + }, + { + "epoch": 0.239664, + "grad_norm": 1.1171875, + "learning_rate": 7.664838709677419e-05, + "loss": 0.1531, + "step": 14979 + }, + { + "epoch": 0.23968, + "grad_norm": 0.65625, + "learning_rate": 7.664677419354839e-05, + "loss": 0.1478, + "step": 14980 + }, + { + "epoch": 0.239696, + "grad_norm": 0.69921875, + "learning_rate": 7.664516129032259e-05, + "loss": 0.1792, + "step": 14981 + }, + { + "epoch": 0.239712, + "grad_norm": 0.921875, + "learning_rate": 7.664354838709678e-05, + "loss": 0.1825, + "step": 14982 + }, + { + "epoch": 0.239728, + "grad_norm": 0.96875, + "learning_rate": 7.664193548387098e-05, + "loss": 0.1806, + "step": 14983 + }, + { + "epoch": 0.239744, + "grad_norm": 0.8828125, + "learning_rate": 7.664032258064516e-05, + "loss": 0.1725, + "step": 14984 + }, + { + "epoch": 0.23976, + "grad_norm": 0.78515625, + "learning_rate": 7.663870967741936e-05, + "loss": 0.1744, + "step": 14985 + }, + { + "epoch": 0.239776, + "grad_norm": 0.79296875, + "learning_rate": 7.663709677419355e-05, + "loss": 0.1954, + "step": 14986 + }, + { + "epoch": 0.239792, + "grad_norm": 0.640625, + "learning_rate": 7.663548387096775e-05, + "loss": 0.1715, + "step": 14987 + }, + { + "epoch": 0.239808, + "grad_norm": 1.1640625, + "learning_rate": 7.663387096774193e-05, + "loss": 0.1839, + "step": 14988 + }, + { + "epoch": 0.239824, + "grad_norm": 0.66015625, + "learning_rate": 7.663225806451613e-05, + "loss": 0.1509, + "step": 14989 + }, + { + "epoch": 0.23984, + "grad_norm": 0.828125, + "learning_rate": 7.663064516129032e-05, + "loss": 0.1505, + "step": 14990 + }, + { + "epoch": 0.239856, + "grad_norm": 0.90234375, + "learning_rate": 7.662903225806452e-05, + "loss": 0.1367, + "step": 14991 + }, + { + "epoch": 0.239872, + "grad_norm": 0.79296875, + "learning_rate": 7.662741935483872e-05, + "loss": 0.1439, + "step": 14992 + }, + { + "epoch": 0.239888, + "grad_norm": 1.09375, + "learning_rate": 7.66258064516129e-05, + "loss": 0.1544, + "step": 14993 + }, + { + "epoch": 0.239904, + "grad_norm": 0.63671875, + "learning_rate": 7.66241935483871e-05, + "loss": 0.1539, + "step": 14994 + }, + { + "epoch": 0.23992, + "grad_norm": 0.58203125, + "learning_rate": 7.662258064516129e-05, + "loss": 0.1345, + "step": 14995 + }, + { + "epoch": 0.239936, + "grad_norm": 0.890625, + "learning_rate": 7.662096774193549e-05, + "loss": 0.1817, + "step": 14996 + }, + { + "epoch": 0.239952, + "grad_norm": 0.85546875, + "learning_rate": 7.661935483870968e-05, + "loss": 0.1247, + "step": 14997 + }, + { + "epoch": 0.239968, + "grad_norm": 0.921875, + "learning_rate": 7.661774193548388e-05, + "loss": 0.229, + "step": 14998 + }, + { + "epoch": 0.239984, + "grad_norm": 0.59375, + "learning_rate": 7.661612903225806e-05, + "loss": 0.1663, + "step": 14999 + }, + { + "epoch": 0.24, + "grad_norm": 1.03125, + "learning_rate": 7.661451612903226e-05, + "loss": 0.185, + "step": 15000 + }, + { + "epoch": 0.240016, + "grad_norm": 0.8046875, + "learning_rate": 7.661290322580645e-05, + "loss": 0.1525, + "step": 15001 + }, + { + "epoch": 0.240032, + "grad_norm": 0.443359375, + "learning_rate": 7.661129032258065e-05, + "loss": 0.1523, + "step": 15002 + }, + { + "epoch": 0.240048, + "grad_norm": 0.82421875, + "learning_rate": 7.660967741935485e-05, + "loss": 0.1954, + "step": 15003 + }, + { + "epoch": 0.240064, + "grad_norm": 0.765625, + "learning_rate": 7.660806451612905e-05, + "loss": 0.1884, + "step": 15004 + }, + { + "epoch": 0.24008, + "grad_norm": 0.80859375, + "learning_rate": 7.660645161290323e-05, + "loss": 0.1325, + "step": 15005 + }, + { + "epoch": 0.240096, + "grad_norm": 0.7578125, + "learning_rate": 7.660483870967742e-05, + "loss": 0.167, + "step": 15006 + }, + { + "epoch": 0.240112, + "grad_norm": 0.76171875, + "learning_rate": 7.660322580645162e-05, + "loss": 0.1644, + "step": 15007 + }, + { + "epoch": 0.240128, + "grad_norm": 1.2578125, + "learning_rate": 7.66016129032258e-05, + "loss": 0.1656, + "step": 15008 + }, + { + "epoch": 0.240144, + "grad_norm": 0.90625, + "learning_rate": 7.66e-05, + "loss": 0.2127, + "step": 15009 + }, + { + "epoch": 0.24016, + "grad_norm": 0.984375, + "learning_rate": 7.659838709677419e-05, + "loss": 0.1825, + "step": 15010 + }, + { + "epoch": 0.240176, + "grad_norm": 1.0078125, + "learning_rate": 7.659677419354839e-05, + "loss": 0.1687, + "step": 15011 + }, + { + "epoch": 0.240192, + "grad_norm": 0.79296875, + "learning_rate": 7.659516129032258e-05, + "loss": 0.187, + "step": 15012 + }, + { + "epoch": 0.240208, + "grad_norm": 0.6484375, + "learning_rate": 7.659354838709678e-05, + "loss": 0.1435, + "step": 15013 + }, + { + "epoch": 0.240224, + "grad_norm": 1.0703125, + "learning_rate": 7.659193548387096e-05, + "loss": 0.167, + "step": 15014 + }, + { + "epoch": 0.24024, + "grad_norm": 0.640625, + "learning_rate": 7.659032258064516e-05, + "loss": 0.2134, + "step": 15015 + }, + { + "epoch": 0.240256, + "grad_norm": 0.96484375, + "learning_rate": 7.658870967741936e-05, + "loss": 0.1664, + "step": 15016 + }, + { + "epoch": 0.240272, + "grad_norm": 0.70703125, + "learning_rate": 7.658709677419356e-05, + "loss": 0.1853, + "step": 15017 + }, + { + "epoch": 0.240288, + "grad_norm": 0.90625, + "learning_rate": 7.658548387096775e-05, + "loss": 0.1552, + "step": 15018 + }, + { + "epoch": 0.240304, + "grad_norm": 0.7109375, + "learning_rate": 7.658387096774195e-05, + "loss": 0.1974, + "step": 15019 + }, + { + "epoch": 0.24032, + "grad_norm": 0.66796875, + "learning_rate": 7.658225806451613e-05, + "loss": 0.149, + "step": 15020 + }, + { + "epoch": 0.240336, + "grad_norm": 0.97265625, + "learning_rate": 7.658064516129032e-05, + "loss": 0.1757, + "step": 15021 + }, + { + "epoch": 0.240352, + "grad_norm": 0.75390625, + "learning_rate": 7.657903225806452e-05, + "loss": 0.1914, + "step": 15022 + }, + { + "epoch": 0.240368, + "grad_norm": 0.95703125, + "learning_rate": 7.65774193548387e-05, + "loss": 0.1602, + "step": 15023 + }, + { + "epoch": 0.240384, + "grad_norm": 0.8125, + "learning_rate": 7.65758064516129e-05, + "loss": 0.1961, + "step": 15024 + }, + { + "epoch": 0.2404, + "grad_norm": 0.69140625, + "learning_rate": 7.657419354838709e-05, + "loss": 0.163, + "step": 15025 + }, + { + "epoch": 0.240416, + "grad_norm": 0.65625, + "learning_rate": 7.657258064516129e-05, + "loss": 0.167, + "step": 15026 + }, + { + "epoch": 0.240432, + "grad_norm": 0.77734375, + "learning_rate": 7.657096774193549e-05, + "loss": 0.1846, + "step": 15027 + }, + { + "epoch": 0.240448, + "grad_norm": 0.6640625, + "learning_rate": 7.656935483870969e-05, + "loss": 0.1937, + "step": 15028 + }, + { + "epoch": 0.240464, + "grad_norm": 0.52734375, + "learning_rate": 7.656774193548388e-05, + "loss": 0.1669, + "step": 15029 + }, + { + "epoch": 0.24048, + "grad_norm": 0.76171875, + "learning_rate": 7.656612903225808e-05, + "loss": 0.1603, + "step": 15030 + }, + { + "epoch": 0.240496, + "grad_norm": 0.6484375, + "learning_rate": 7.656451612903226e-05, + "loss": 0.1717, + "step": 15031 + }, + { + "epoch": 0.240512, + "grad_norm": 0.68359375, + "learning_rate": 7.656290322580646e-05, + "loss": 0.1671, + "step": 15032 + }, + { + "epoch": 0.240528, + "grad_norm": 0.77734375, + "learning_rate": 7.656129032258065e-05, + "loss": 0.1469, + "step": 15033 + }, + { + "epoch": 0.240544, + "grad_norm": 0.71484375, + "learning_rate": 7.655967741935485e-05, + "loss": 0.1698, + "step": 15034 + }, + { + "epoch": 0.24056, + "grad_norm": 0.8515625, + "learning_rate": 7.655806451612903e-05, + "loss": 0.1924, + "step": 15035 + }, + { + "epoch": 0.240576, + "grad_norm": 0.66015625, + "learning_rate": 7.655645161290323e-05, + "loss": 0.1458, + "step": 15036 + }, + { + "epoch": 0.240592, + "grad_norm": 0.7890625, + "learning_rate": 7.655483870967742e-05, + "loss": 0.1717, + "step": 15037 + }, + { + "epoch": 0.240608, + "grad_norm": 1.109375, + "learning_rate": 7.655322580645162e-05, + "loss": 0.1794, + "step": 15038 + }, + { + "epoch": 0.240624, + "grad_norm": 0.6640625, + "learning_rate": 7.65516129032258e-05, + "loss": 0.138, + "step": 15039 + }, + { + "epoch": 0.24064, + "grad_norm": 0.9921875, + "learning_rate": 7.655e-05, + "loss": 0.2053, + "step": 15040 + }, + { + "epoch": 0.240656, + "grad_norm": 1.5390625, + "learning_rate": 7.65483870967742e-05, + "loss": 0.1844, + "step": 15041 + }, + { + "epoch": 0.240672, + "grad_norm": 0.8984375, + "learning_rate": 7.654677419354839e-05, + "loss": 0.1691, + "step": 15042 + }, + { + "epoch": 0.240688, + "grad_norm": 0.7578125, + "learning_rate": 7.654516129032259e-05, + "loss": 0.1937, + "step": 15043 + }, + { + "epoch": 0.240704, + "grad_norm": 0.6953125, + "learning_rate": 7.654354838709678e-05, + "loss": 0.1604, + "step": 15044 + }, + { + "epoch": 0.24072, + "grad_norm": 0.80859375, + "learning_rate": 7.654193548387098e-05, + "loss": 0.1636, + "step": 15045 + }, + { + "epoch": 0.240736, + "grad_norm": 0.9375, + "learning_rate": 7.654032258064516e-05, + "loss": 0.1434, + "step": 15046 + }, + { + "epoch": 0.240752, + "grad_norm": 0.61328125, + "learning_rate": 7.653870967741936e-05, + "loss": 0.1454, + "step": 15047 + }, + { + "epoch": 0.240768, + "grad_norm": 1.2734375, + "learning_rate": 7.653709677419355e-05, + "loss": 0.1561, + "step": 15048 + }, + { + "epoch": 0.240784, + "grad_norm": 1.0234375, + "learning_rate": 7.653548387096775e-05, + "loss": 0.2137, + "step": 15049 + }, + { + "epoch": 0.2408, + "grad_norm": 1.0546875, + "learning_rate": 7.653387096774193e-05, + "loss": 0.1937, + "step": 15050 + }, + { + "epoch": 0.240816, + "grad_norm": 0.6328125, + "learning_rate": 7.653225806451613e-05, + "loss": 0.1698, + "step": 15051 + }, + { + "epoch": 0.240832, + "grad_norm": 0.73046875, + "learning_rate": 7.653064516129033e-05, + "loss": 0.207, + "step": 15052 + }, + { + "epoch": 0.240848, + "grad_norm": 1.1953125, + "learning_rate": 7.652903225806452e-05, + "loss": 0.1888, + "step": 15053 + }, + { + "epoch": 0.240864, + "grad_norm": 0.8125, + "learning_rate": 7.652741935483872e-05, + "loss": 0.1639, + "step": 15054 + }, + { + "epoch": 0.24088, + "grad_norm": 1.1015625, + "learning_rate": 7.65258064516129e-05, + "loss": 0.2036, + "step": 15055 + }, + { + "epoch": 0.240896, + "grad_norm": 0.6015625, + "learning_rate": 7.65241935483871e-05, + "loss": 0.1362, + "step": 15056 + }, + { + "epoch": 0.240912, + "grad_norm": 0.74609375, + "learning_rate": 7.652258064516129e-05, + "loss": 0.1667, + "step": 15057 + }, + { + "epoch": 0.240928, + "grad_norm": 0.921875, + "learning_rate": 7.652096774193549e-05, + "loss": 0.1513, + "step": 15058 + }, + { + "epoch": 0.240944, + "grad_norm": 0.625, + "learning_rate": 7.651935483870967e-05, + "loss": 0.1864, + "step": 15059 + }, + { + "epoch": 0.24096, + "grad_norm": 0.66015625, + "learning_rate": 7.651774193548387e-05, + "loss": 0.1776, + "step": 15060 + }, + { + "epoch": 0.240976, + "grad_norm": 0.7890625, + "learning_rate": 7.651612903225806e-05, + "loss": 0.1747, + "step": 15061 + }, + { + "epoch": 0.240992, + "grad_norm": 0.59375, + "learning_rate": 7.651451612903226e-05, + "loss": 0.1764, + "step": 15062 + }, + { + "epoch": 0.241008, + "grad_norm": 1.4140625, + "learning_rate": 7.651290322580646e-05, + "loss": 0.1397, + "step": 15063 + }, + { + "epoch": 0.241024, + "grad_norm": 1.109375, + "learning_rate": 7.651129032258066e-05, + "loss": 0.1864, + "step": 15064 + }, + { + "epoch": 0.24104, + "grad_norm": 0.83984375, + "learning_rate": 7.650967741935485e-05, + "loss": 0.1752, + "step": 15065 + }, + { + "epoch": 0.241056, + "grad_norm": 1.5234375, + "learning_rate": 7.650806451612905e-05, + "loss": 0.1684, + "step": 15066 + }, + { + "epoch": 0.241072, + "grad_norm": 0.8046875, + "learning_rate": 7.650645161290323e-05, + "loss": 0.134, + "step": 15067 + }, + { + "epoch": 0.241088, + "grad_norm": 1.3984375, + "learning_rate": 7.650483870967742e-05, + "loss": 0.1371, + "step": 15068 + }, + { + "epoch": 0.241104, + "grad_norm": 1.109375, + "learning_rate": 7.650322580645162e-05, + "loss": 0.1808, + "step": 15069 + }, + { + "epoch": 0.24112, + "grad_norm": 1.15625, + "learning_rate": 7.65016129032258e-05, + "loss": 0.1602, + "step": 15070 + }, + { + "epoch": 0.241136, + "grad_norm": 0.921875, + "learning_rate": 7.65e-05, + "loss": 0.1802, + "step": 15071 + }, + { + "epoch": 0.241152, + "grad_norm": 0.69140625, + "learning_rate": 7.649838709677419e-05, + "loss": 0.164, + "step": 15072 + }, + { + "epoch": 0.241168, + "grad_norm": 0.625, + "learning_rate": 7.649677419354839e-05, + "loss": 0.1306, + "step": 15073 + }, + { + "epoch": 0.241184, + "grad_norm": 0.6328125, + "learning_rate": 7.649516129032257e-05, + "loss": 0.1278, + "step": 15074 + }, + { + "epoch": 0.2412, + "grad_norm": 0.6796875, + "learning_rate": 7.649354838709677e-05, + "loss": 0.1701, + "step": 15075 + }, + { + "epoch": 0.241216, + "grad_norm": 0.78515625, + "learning_rate": 7.649193548387097e-05, + "loss": 0.1872, + "step": 15076 + }, + { + "epoch": 0.241232, + "grad_norm": 0.6875, + "learning_rate": 7.649032258064517e-05, + "loss": 0.1507, + "step": 15077 + }, + { + "epoch": 0.241248, + "grad_norm": 0.640625, + "learning_rate": 7.648870967741936e-05, + "loss": 0.158, + "step": 15078 + }, + { + "epoch": 0.241264, + "grad_norm": 0.92578125, + "learning_rate": 7.648709677419356e-05, + "loss": 0.1835, + "step": 15079 + }, + { + "epoch": 0.24128, + "grad_norm": 0.7890625, + "learning_rate": 7.648548387096775e-05, + "loss": 0.1918, + "step": 15080 + }, + { + "epoch": 0.241296, + "grad_norm": 1.1484375, + "learning_rate": 7.648387096774194e-05, + "loss": 0.1349, + "step": 15081 + }, + { + "epoch": 0.241312, + "grad_norm": 1.3515625, + "learning_rate": 7.648225806451613e-05, + "loss": 0.2478, + "step": 15082 + }, + { + "epoch": 0.241328, + "grad_norm": 0.66796875, + "learning_rate": 7.648064516129032e-05, + "loss": 0.1772, + "step": 15083 + }, + { + "epoch": 0.241344, + "grad_norm": 0.97265625, + "learning_rate": 7.647903225806452e-05, + "loss": 0.1722, + "step": 15084 + }, + { + "epoch": 0.24136, + "grad_norm": 0.76171875, + "learning_rate": 7.64774193548387e-05, + "loss": 0.2024, + "step": 15085 + }, + { + "epoch": 0.241376, + "grad_norm": 0.625, + "learning_rate": 7.64758064516129e-05, + "loss": 0.1516, + "step": 15086 + }, + { + "epoch": 0.241392, + "grad_norm": 0.609375, + "learning_rate": 7.64741935483871e-05, + "loss": 0.1667, + "step": 15087 + }, + { + "epoch": 0.241408, + "grad_norm": 0.89453125, + "learning_rate": 7.64725806451613e-05, + "loss": 0.2136, + "step": 15088 + }, + { + "epoch": 0.241424, + "grad_norm": 1.3203125, + "learning_rate": 7.647096774193549e-05, + "loss": 0.2195, + "step": 15089 + }, + { + "epoch": 0.24144, + "grad_norm": 0.94140625, + "learning_rate": 7.646935483870969e-05, + "loss": 0.1604, + "step": 15090 + }, + { + "epoch": 0.241456, + "grad_norm": 0.8515625, + "learning_rate": 7.646774193548387e-05, + "loss": 0.175, + "step": 15091 + }, + { + "epoch": 0.241472, + "grad_norm": 0.59375, + "learning_rate": 7.646612903225807e-05, + "loss": 0.1487, + "step": 15092 + }, + { + "epoch": 0.241488, + "grad_norm": 0.76171875, + "learning_rate": 7.646451612903226e-05, + "loss": 0.1865, + "step": 15093 + }, + { + "epoch": 0.241504, + "grad_norm": 0.734375, + "learning_rate": 7.646290322580646e-05, + "loss": 0.1477, + "step": 15094 + }, + { + "epoch": 0.24152, + "grad_norm": 0.71875, + "learning_rate": 7.646129032258064e-05, + "loss": 0.1763, + "step": 15095 + }, + { + "epoch": 0.241536, + "grad_norm": 1.3359375, + "learning_rate": 7.645967741935484e-05, + "loss": 0.1554, + "step": 15096 + }, + { + "epoch": 0.241552, + "grad_norm": 0.86328125, + "learning_rate": 7.645806451612903e-05, + "loss": 0.1843, + "step": 15097 + }, + { + "epoch": 0.241568, + "grad_norm": 0.83984375, + "learning_rate": 7.645645161290323e-05, + "loss": 0.1651, + "step": 15098 + }, + { + "epoch": 0.241584, + "grad_norm": 0.859375, + "learning_rate": 7.645483870967743e-05, + "loss": 0.1693, + "step": 15099 + }, + { + "epoch": 0.2416, + "grad_norm": 1.1015625, + "learning_rate": 7.645322580645162e-05, + "loss": 0.1728, + "step": 15100 + }, + { + "epoch": 0.241616, + "grad_norm": 0.6484375, + "learning_rate": 7.645161290322582e-05, + "loss": 0.1608, + "step": 15101 + }, + { + "epoch": 0.241632, + "grad_norm": 0.58984375, + "learning_rate": 7.645e-05, + "loss": 0.1772, + "step": 15102 + }, + { + "epoch": 0.241648, + "grad_norm": 0.97265625, + "learning_rate": 7.64483870967742e-05, + "loss": 0.1933, + "step": 15103 + }, + { + "epoch": 0.241664, + "grad_norm": 0.546875, + "learning_rate": 7.644677419354839e-05, + "loss": 0.1716, + "step": 15104 + }, + { + "epoch": 0.24168, + "grad_norm": 0.94140625, + "learning_rate": 7.644516129032259e-05, + "loss": 0.1917, + "step": 15105 + }, + { + "epoch": 0.241696, + "grad_norm": 0.8828125, + "learning_rate": 7.644354838709677e-05, + "loss": 0.1944, + "step": 15106 + }, + { + "epoch": 0.241712, + "grad_norm": 0.91796875, + "learning_rate": 7.644193548387097e-05, + "loss": 0.1894, + "step": 15107 + }, + { + "epoch": 0.241728, + "grad_norm": 1.046875, + "learning_rate": 7.644032258064516e-05, + "loss": 0.1532, + "step": 15108 + }, + { + "epoch": 0.241744, + "grad_norm": 0.875, + "learning_rate": 7.643870967741936e-05, + "loss": 0.1784, + "step": 15109 + }, + { + "epoch": 0.24176, + "grad_norm": 1.125, + "learning_rate": 7.643709677419354e-05, + "loss": 0.1972, + "step": 15110 + }, + { + "epoch": 0.241776, + "grad_norm": 0.67578125, + "learning_rate": 7.643548387096774e-05, + "loss": 0.1653, + "step": 15111 + }, + { + "epoch": 0.241792, + "grad_norm": 0.9765625, + "learning_rate": 7.643387096774194e-05, + "loss": 0.1573, + "step": 15112 + }, + { + "epoch": 0.241808, + "grad_norm": 1.0234375, + "learning_rate": 7.643225806451614e-05, + "loss": 0.1764, + "step": 15113 + }, + { + "epoch": 0.241824, + "grad_norm": 0.796875, + "learning_rate": 7.643064516129033e-05, + "loss": 0.1813, + "step": 15114 + }, + { + "epoch": 0.24184, + "grad_norm": 0.8359375, + "learning_rate": 7.642903225806452e-05, + "loss": 0.191, + "step": 15115 + }, + { + "epoch": 0.241856, + "grad_norm": 0.8125, + "learning_rate": 7.642741935483872e-05, + "loss": 0.1592, + "step": 15116 + }, + { + "epoch": 0.241872, + "grad_norm": 0.83203125, + "learning_rate": 7.64258064516129e-05, + "loss": 0.1779, + "step": 15117 + }, + { + "epoch": 0.241888, + "grad_norm": 0.8828125, + "learning_rate": 7.64241935483871e-05, + "loss": 0.1977, + "step": 15118 + }, + { + "epoch": 0.241904, + "grad_norm": 0.87890625, + "learning_rate": 7.642258064516129e-05, + "loss": 0.1564, + "step": 15119 + }, + { + "epoch": 0.24192, + "grad_norm": 0.69140625, + "learning_rate": 7.642096774193549e-05, + "loss": 0.1704, + "step": 15120 + }, + { + "epoch": 0.241936, + "grad_norm": 0.5859375, + "learning_rate": 7.641935483870967e-05, + "loss": 0.1652, + "step": 15121 + }, + { + "epoch": 0.241952, + "grad_norm": 0.82421875, + "learning_rate": 7.641774193548387e-05, + "loss": 0.1695, + "step": 15122 + }, + { + "epoch": 0.241968, + "grad_norm": 0.63671875, + "learning_rate": 7.641612903225807e-05, + "loss": 0.1714, + "step": 15123 + }, + { + "epoch": 0.241984, + "grad_norm": 0.83203125, + "learning_rate": 7.641451612903227e-05, + "loss": 0.1564, + "step": 15124 + }, + { + "epoch": 0.242, + "grad_norm": 1.2109375, + "learning_rate": 7.641290322580646e-05, + "loss": 0.1795, + "step": 15125 + }, + { + "epoch": 0.242016, + "grad_norm": 0.65625, + "learning_rate": 7.641129032258066e-05, + "loss": 0.1619, + "step": 15126 + }, + { + "epoch": 0.242032, + "grad_norm": 0.51953125, + "learning_rate": 7.640967741935484e-05, + "loss": 0.1494, + "step": 15127 + }, + { + "epoch": 0.242048, + "grad_norm": 0.69921875, + "learning_rate": 7.640806451612904e-05, + "loss": 0.188, + "step": 15128 + }, + { + "epoch": 0.242064, + "grad_norm": 0.984375, + "learning_rate": 7.640645161290323e-05, + "loss": 0.1692, + "step": 15129 + }, + { + "epoch": 0.24208, + "grad_norm": 1.0703125, + "learning_rate": 7.640483870967742e-05, + "loss": 0.186, + "step": 15130 + }, + { + "epoch": 0.242096, + "grad_norm": 0.99609375, + "learning_rate": 7.640322580645161e-05, + "loss": 0.2448, + "step": 15131 + }, + { + "epoch": 0.242112, + "grad_norm": 0.92578125, + "learning_rate": 7.64016129032258e-05, + "loss": 0.183, + "step": 15132 + }, + { + "epoch": 0.242128, + "grad_norm": 0.828125, + "learning_rate": 7.64e-05, + "loss": 0.2106, + "step": 15133 + }, + { + "epoch": 0.242144, + "grad_norm": 0.85546875, + "learning_rate": 7.63983870967742e-05, + "loss": 0.1551, + "step": 15134 + }, + { + "epoch": 0.24216, + "grad_norm": 1.7421875, + "learning_rate": 7.639677419354839e-05, + "loss": 0.2284, + "step": 15135 + }, + { + "epoch": 0.242176, + "grad_norm": 0.671875, + "learning_rate": 7.639516129032259e-05, + "loss": 0.1397, + "step": 15136 + }, + { + "epoch": 0.242192, + "grad_norm": 0.6328125, + "learning_rate": 7.639354838709679e-05, + "loss": 0.1805, + "step": 15137 + }, + { + "epoch": 0.242208, + "grad_norm": 0.98828125, + "learning_rate": 7.639193548387097e-05, + "loss": 0.1486, + "step": 15138 + }, + { + "epoch": 0.242224, + "grad_norm": 0.9140625, + "learning_rate": 7.639032258064517e-05, + "loss": 0.176, + "step": 15139 + }, + { + "epoch": 0.24224, + "grad_norm": 1.0703125, + "learning_rate": 7.638870967741936e-05, + "loss": 0.1414, + "step": 15140 + }, + { + "epoch": 0.242256, + "grad_norm": 0.69921875, + "learning_rate": 7.638709677419356e-05, + "loss": 0.1605, + "step": 15141 + }, + { + "epoch": 0.242272, + "grad_norm": 1.0859375, + "learning_rate": 7.638548387096774e-05, + "loss": 0.1373, + "step": 15142 + }, + { + "epoch": 0.242288, + "grad_norm": 0.796875, + "learning_rate": 7.638387096774194e-05, + "loss": 0.1653, + "step": 15143 + }, + { + "epoch": 0.242304, + "grad_norm": 0.6875, + "learning_rate": 7.638225806451613e-05, + "loss": 0.1682, + "step": 15144 + }, + { + "epoch": 0.24232, + "grad_norm": 0.67578125, + "learning_rate": 7.638064516129033e-05, + "loss": 0.1614, + "step": 15145 + }, + { + "epoch": 0.242336, + "grad_norm": 0.69140625, + "learning_rate": 7.637903225806451e-05, + "loss": 0.1769, + "step": 15146 + }, + { + "epoch": 0.242352, + "grad_norm": 0.69140625, + "learning_rate": 7.637741935483871e-05, + "loss": 0.1424, + "step": 15147 + }, + { + "epoch": 0.242368, + "grad_norm": 1.15625, + "learning_rate": 7.637580645161291e-05, + "loss": 0.1534, + "step": 15148 + }, + { + "epoch": 0.242384, + "grad_norm": 0.625, + "learning_rate": 7.63741935483871e-05, + "loss": 0.1581, + "step": 15149 + }, + { + "epoch": 0.2424, + "grad_norm": 0.9453125, + "learning_rate": 7.63725806451613e-05, + "loss": 0.1644, + "step": 15150 + }, + { + "epoch": 0.242416, + "grad_norm": 0.80078125, + "learning_rate": 7.637096774193549e-05, + "loss": 0.1342, + "step": 15151 + }, + { + "epoch": 0.242432, + "grad_norm": 0.81640625, + "learning_rate": 7.636935483870968e-05, + "loss": 0.1714, + "step": 15152 + }, + { + "epoch": 0.242448, + "grad_norm": 0.88671875, + "learning_rate": 7.636774193548387e-05, + "loss": 0.1558, + "step": 15153 + }, + { + "epoch": 0.242464, + "grad_norm": 0.7109375, + "learning_rate": 7.636612903225807e-05, + "loss": 0.1704, + "step": 15154 + }, + { + "epoch": 0.24248, + "grad_norm": 1.046875, + "learning_rate": 7.636451612903226e-05, + "loss": 0.2171, + "step": 15155 + }, + { + "epoch": 0.242496, + "grad_norm": 1.109375, + "learning_rate": 7.636290322580646e-05, + "loss": 0.1781, + "step": 15156 + }, + { + "epoch": 0.242512, + "grad_norm": 1.109375, + "learning_rate": 7.636129032258064e-05, + "loss": 0.152, + "step": 15157 + }, + { + "epoch": 0.242528, + "grad_norm": 0.66796875, + "learning_rate": 7.635967741935484e-05, + "loss": 0.1465, + "step": 15158 + }, + { + "epoch": 0.242544, + "grad_norm": 0.62890625, + "learning_rate": 7.635806451612904e-05, + "loss": 0.162, + "step": 15159 + }, + { + "epoch": 0.24256, + "grad_norm": 1.09375, + "learning_rate": 7.635645161290324e-05, + "loss": 0.1445, + "step": 15160 + }, + { + "epoch": 0.242576, + "grad_norm": 0.71484375, + "learning_rate": 7.635483870967743e-05, + "loss": 0.1529, + "step": 15161 + }, + { + "epoch": 0.242592, + "grad_norm": 0.796875, + "learning_rate": 7.635322580645161e-05, + "loss": 0.1556, + "step": 15162 + }, + { + "epoch": 0.242608, + "grad_norm": 1.1171875, + "learning_rate": 7.635161290322581e-05, + "loss": 0.162, + "step": 15163 + }, + { + "epoch": 0.242624, + "grad_norm": 0.73046875, + "learning_rate": 7.635e-05, + "loss": 0.1688, + "step": 15164 + }, + { + "epoch": 0.24264, + "grad_norm": 0.8046875, + "learning_rate": 7.63483870967742e-05, + "loss": 0.1783, + "step": 15165 + }, + { + "epoch": 0.242656, + "grad_norm": 1.1171875, + "learning_rate": 7.634677419354838e-05, + "loss": 0.2124, + "step": 15166 + }, + { + "epoch": 0.242672, + "grad_norm": 0.77734375, + "learning_rate": 7.634516129032258e-05, + "loss": 0.176, + "step": 15167 + }, + { + "epoch": 0.242688, + "grad_norm": 0.71875, + "learning_rate": 7.634354838709677e-05, + "loss": 0.173, + "step": 15168 + }, + { + "epoch": 0.242704, + "grad_norm": 0.6484375, + "learning_rate": 7.634193548387097e-05, + "loss": 0.1765, + "step": 15169 + }, + { + "epoch": 0.24272, + "grad_norm": 0.72265625, + "learning_rate": 7.634032258064516e-05, + "loss": 0.1855, + "step": 15170 + }, + { + "epoch": 0.242736, + "grad_norm": 0.765625, + "learning_rate": 7.633870967741936e-05, + "loss": 0.1789, + "step": 15171 + }, + { + "epoch": 0.242752, + "grad_norm": 0.765625, + "learning_rate": 7.633709677419356e-05, + "loss": 0.1434, + "step": 15172 + }, + { + "epoch": 0.242768, + "grad_norm": 0.85546875, + "learning_rate": 7.633548387096776e-05, + "loss": 0.1596, + "step": 15173 + }, + { + "epoch": 0.242784, + "grad_norm": 0.9140625, + "learning_rate": 7.633387096774194e-05, + "loss": 0.1603, + "step": 15174 + }, + { + "epoch": 0.2428, + "grad_norm": 0.65234375, + "learning_rate": 7.633225806451614e-05, + "loss": 0.1262, + "step": 15175 + }, + { + "epoch": 0.242816, + "grad_norm": 0.7421875, + "learning_rate": 7.633064516129033e-05, + "loss": 0.1694, + "step": 15176 + }, + { + "epoch": 0.242832, + "grad_norm": 0.69921875, + "learning_rate": 7.632903225806451e-05, + "loss": 0.1799, + "step": 15177 + }, + { + "epoch": 0.242848, + "grad_norm": 0.94921875, + "learning_rate": 7.632741935483871e-05, + "loss": 0.1892, + "step": 15178 + }, + { + "epoch": 0.242864, + "grad_norm": 0.80859375, + "learning_rate": 7.63258064516129e-05, + "loss": 0.1586, + "step": 15179 + }, + { + "epoch": 0.24288, + "grad_norm": 0.8046875, + "learning_rate": 7.63241935483871e-05, + "loss": 0.2005, + "step": 15180 + }, + { + "epoch": 0.242896, + "grad_norm": 0.85546875, + "learning_rate": 7.632258064516128e-05, + "loss": 0.1622, + "step": 15181 + }, + { + "epoch": 0.242912, + "grad_norm": 0.59765625, + "learning_rate": 7.632096774193548e-05, + "loss": 0.1397, + "step": 15182 + }, + { + "epoch": 0.242928, + "grad_norm": 1.234375, + "learning_rate": 7.631935483870968e-05, + "loss": 0.1896, + "step": 15183 + }, + { + "epoch": 0.242944, + "grad_norm": 1.1015625, + "learning_rate": 7.631774193548388e-05, + "loss": 0.1895, + "step": 15184 + }, + { + "epoch": 0.24296, + "grad_norm": 0.77734375, + "learning_rate": 7.631612903225807e-05, + "loss": 0.2009, + "step": 15185 + }, + { + "epoch": 0.242976, + "grad_norm": 0.96875, + "learning_rate": 7.631451612903227e-05, + "loss": 0.1761, + "step": 15186 + }, + { + "epoch": 0.242992, + "grad_norm": 0.734375, + "learning_rate": 7.631290322580646e-05, + "loss": 0.1882, + "step": 15187 + }, + { + "epoch": 0.243008, + "grad_norm": 1.03125, + "learning_rate": 7.631129032258065e-05, + "loss": 0.1769, + "step": 15188 + }, + { + "epoch": 0.243024, + "grad_norm": 1.0546875, + "learning_rate": 7.630967741935484e-05, + "loss": 0.143, + "step": 15189 + }, + { + "epoch": 0.24304, + "grad_norm": 0.77734375, + "learning_rate": 7.630806451612904e-05, + "loss": 0.1712, + "step": 15190 + }, + { + "epoch": 0.243056, + "grad_norm": 0.75, + "learning_rate": 7.630645161290323e-05, + "loss": 0.1665, + "step": 15191 + }, + { + "epoch": 0.243072, + "grad_norm": 0.68359375, + "learning_rate": 7.630483870967741e-05, + "loss": 0.1644, + "step": 15192 + }, + { + "epoch": 0.243088, + "grad_norm": 0.61328125, + "learning_rate": 7.630322580645161e-05, + "loss": 0.1591, + "step": 15193 + }, + { + "epoch": 0.243104, + "grad_norm": 0.68359375, + "learning_rate": 7.630161290322581e-05, + "loss": 0.1259, + "step": 15194 + }, + { + "epoch": 0.24312, + "grad_norm": 0.84375, + "learning_rate": 7.630000000000001e-05, + "loss": 0.1872, + "step": 15195 + }, + { + "epoch": 0.243136, + "grad_norm": 0.57421875, + "learning_rate": 7.62983870967742e-05, + "loss": 0.1304, + "step": 15196 + }, + { + "epoch": 0.243152, + "grad_norm": 0.68359375, + "learning_rate": 7.62967741935484e-05, + "loss": 0.1688, + "step": 15197 + }, + { + "epoch": 0.243168, + "grad_norm": 0.58203125, + "learning_rate": 7.629516129032258e-05, + "loss": 0.1401, + "step": 15198 + }, + { + "epoch": 0.243184, + "grad_norm": 0.66015625, + "learning_rate": 7.629354838709678e-05, + "loss": 0.1799, + "step": 15199 + }, + { + "epoch": 0.2432, + "grad_norm": 0.6875, + "learning_rate": 7.629193548387097e-05, + "loss": 0.1895, + "step": 15200 + }, + { + "epoch": 0.243216, + "grad_norm": 1.0, + "learning_rate": 7.629032258064517e-05, + "loss": 0.19, + "step": 15201 + }, + { + "epoch": 0.243232, + "grad_norm": 0.5859375, + "learning_rate": 7.628870967741935e-05, + "loss": 0.1873, + "step": 15202 + }, + { + "epoch": 0.243248, + "grad_norm": 0.7734375, + "learning_rate": 7.628709677419355e-05, + "loss": 0.1532, + "step": 15203 + }, + { + "epoch": 0.243264, + "grad_norm": 0.9140625, + "learning_rate": 7.628548387096774e-05, + "loss": 0.1279, + "step": 15204 + }, + { + "epoch": 0.24328, + "grad_norm": 1.1953125, + "learning_rate": 7.628387096774194e-05, + "loss": 0.1572, + "step": 15205 + }, + { + "epoch": 0.243296, + "grad_norm": 0.8515625, + "learning_rate": 7.628225806451613e-05, + "loss": 0.187, + "step": 15206 + }, + { + "epoch": 0.243312, + "grad_norm": 1.3515625, + "learning_rate": 7.628064516129033e-05, + "loss": 0.1781, + "step": 15207 + }, + { + "epoch": 0.243328, + "grad_norm": 0.89453125, + "learning_rate": 7.627903225806453e-05, + "loss": 0.1806, + "step": 15208 + }, + { + "epoch": 0.243344, + "grad_norm": 0.82421875, + "learning_rate": 7.627741935483871e-05, + "loss": 0.1712, + "step": 15209 + }, + { + "epoch": 0.24336, + "grad_norm": 1.0234375, + "learning_rate": 7.627580645161291e-05, + "loss": 0.1815, + "step": 15210 + }, + { + "epoch": 0.243376, + "grad_norm": 0.7578125, + "learning_rate": 7.62741935483871e-05, + "loss": 0.1779, + "step": 15211 + }, + { + "epoch": 0.243392, + "grad_norm": 0.5078125, + "learning_rate": 7.62725806451613e-05, + "loss": 0.151, + "step": 15212 + }, + { + "epoch": 0.243408, + "grad_norm": 0.703125, + "learning_rate": 7.627096774193548e-05, + "loss": 0.1624, + "step": 15213 + }, + { + "epoch": 0.243424, + "grad_norm": 0.95703125, + "learning_rate": 7.626935483870968e-05, + "loss": 0.1825, + "step": 15214 + }, + { + "epoch": 0.24344, + "grad_norm": 0.734375, + "learning_rate": 7.626774193548387e-05, + "loss": 0.2002, + "step": 15215 + }, + { + "epoch": 0.243456, + "grad_norm": 0.671875, + "learning_rate": 7.626612903225807e-05, + "loss": 0.1764, + "step": 15216 + }, + { + "epoch": 0.243472, + "grad_norm": 0.7578125, + "learning_rate": 7.626451612903225e-05, + "loss": 0.1616, + "step": 15217 + }, + { + "epoch": 0.243488, + "grad_norm": 0.81640625, + "learning_rate": 7.626290322580645e-05, + "loss": 0.1611, + "step": 15218 + }, + { + "epoch": 0.243504, + "grad_norm": 1.0390625, + "learning_rate": 7.626129032258065e-05, + "loss": 0.1299, + "step": 15219 + }, + { + "epoch": 0.24352, + "grad_norm": 0.5390625, + "learning_rate": 7.625967741935485e-05, + "loss": 0.148, + "step": 15220 + }, + { + "epoch": 0.243536, + "grad_norm": 0.7109375, + "learning_rate": 7.625806451612904e-05, + "loss": 0.1733, + "step": 15221 + }, + { + "epoch": 0.243552, + "grad_norm": 0.625, + "learning_rate": 7.625645161290324e-05, + "loss": 0.1633, + "step": 15222 + }, + { + "epoch": 0.243568, + "grad_norm": 0.83203125, + "learning_rate": 7.625483870967742e-05, + "loss": 0.1588, + "step": 15223 + }, + { + "epoch": 0.243584, + "grad_norm": 0.80859375, + "learning_rate": 7.625322580645161e-05, + "loss": 0.164, + "step": 15224 + }, + { + "epoch": 0.2436, + "grad_norm": 0.56640625, + "learning_rate": 7.625161290322581e-05, + "loss": 0.1708, + "step": 15225 + }, + { + "epoch": 0.243616, + "grad_norm": 0.65234375, + "learning_rate": 7.625e-05, + "loss": 0.1608, + "step": 15226 + }, + { + "epoch": 0.243632, + "grad_norm": 1.4375, + "learning_rate": 7.62483870967742e-05, + "loss": 0.2141, + "step": 15227 + }, + { + "epoch": 0.243648, + "grad_norm": 0.796875, + "learning_rate": 7.624677419354838e-05, + "loss": 0.1713, + "step": 15228 + }, + { + "epoch": 0.243664, + "grad_norm": 0.96484375, + "learning_rate": 7.624516129032258e-05, + "loss": 0.1957, + "step": 15229 + }, + { + "epoch": 0.24368, + "grad_norm": 0.96484375, + "learning_rate": 7.624354838709677e-05, + "loss": 0.1669, + "step": 15230 + }, + { + "epoch": 0.243696, + "grad_norm": 0.78125, + "learning_rate": 7.624193548387097e-05, + "loss": 0.1741, + "step": 15231 + }, + { + "epoch": 0.243712, + "grad_norm": 0.51171875, + "learning_rate": 7.624032258064517e-05, + "loss": 0.1361, + "step": 15232 + }, + { + "epoch": 0.243728, + "grad_norm": 0.78125, + "learning_rate": 7.623870967741937e-05, + "loss": 0.1952, + "step": 15233 + }, + { + "epoch": 0.243744, + "grad_norm": 0.921875, + "learning_rate": 7.623709677419355e-05, + "loss": 0.1701, + "step": 15234 + }, + { + "epoch": 0.24376, + "grad_norm": 0.734375, + "learning_rate": 7.623548387096775e-05, + "loss": 0.154, + "step": 15235 + }, + { + "epoch": 0.243776, + "grad_norm": 1.0546875, + "learning_rate": 7.623387096774194e-05, + "loss": 0.1701, + "step": 15236 + }, + { + "epoch": 0.243792, + "grad_norm": 0.65234375, + "learning_rate": 7.623225806451614e-05, + "loss": 0.2209, + "step": 15237 + }, + { + "epoch": 0.243808, + "grad_norm": 0.71875, + "learning_rate": 7.623064516129032e-05, + "loss": 0.165, + "step": 15238 + }, + { + "epoch": 0.243824, + "grad_norm": 0.87109375, + "learning_rate": 7.622903225806451e-05, + "loss": 0.1606, + "step": 15239 + }, + { + "epoch": 0.24384, + "grad_norm": 0.96484375, + "learning_rate": 7.622741935483871e-05, + "loss": 0.1867, + "step": 15240 + }, + { + "epoch": 0.243856, + "grad_norm": 0.58203125, + "learning_rate": 7.62258064516129e-05, + "loss": 0.2108, + "step": 15241 + }, + { + "epoch": 0.243872, + "grad_norm": 0.69140625, + "learning_rate": 7.62241935483871e-05, + "loss": 0.1539, + "step": 15242 + }, + { + "epoch": 0.243888, + "grad_norm": 0.8984375, + "learning_rate": 7.62225806451613e-05, + "loss": 0.1897, + "step": 15243 + }, + { + "epoch": 0.243904, + "grad_norm": 0.74609375, + "learning_rate": 7.62209677419355e-05, + "loss": 0.1453, + "step": 15244 + }, + { + "epoch": 0.24392, + "grad_norm": 0.7109375, + "learning_rate": 7.621935483870968e-05, + "loss": 0.1576, + "step": 15245 + }, + { + "epoch": 0.243936, + "grad_norm": 0.79296875, + "learning_rate": 7.621774193548388e-05, + "loss": 0.1661, + "step": 15246 + }, + { + "epoch": 0.243952, + "grad_norm": 0.74609375, + "learning_rate": 7.621612903225807e-05, + "loss": 0.1838, + "step": 15247 + }, + { + "epoch": 0.243968, + "grad_norm": 0.828125, + "learning_rate": 7.621451612903227e-05, + "loss": 0.2185, + "step": 15248 + }, + { + "epoch": 0.243984, + "grad_norm": 1.0859375, + "learning_rate": 7.621290322580645e-05, + "loss": 0.1899, + "step": 15249 + }, + { + "epoch": 0.244, + "grad_norm": 0.65234375, + "learning_rate": 7.621129032258065e-05, + "loss": 0.1664, + "step": 15250 + }, + { + "epoch": 0.244016, + "grad_norm": 0.69921875, + "learning_rate": 7.620967741935484e-05, + "loss": 0.1435, + "step": 15251 + }, + { + "epoch": 0.244032, + "grad_norm": 1.0703125, + "learning_rate": 7.620806451612904e-05, + "loss": 0.2158, + "step": 15252 + }, + { + "epoch": 0.244048, + "grad_norm": 0.65625, + "learning_rate": 7.620645161290322e-05, + "loss": 0.1743, + "step": 15253 + }, + { + "epoch": 0.244064, + "grad_norm": 0.7578125, + "learning_rate": 7.620483870967742e-05, + "loss": 0.1745, + "step": 15254 + }, + { + "epoch": 0.24408, + "grad_norm": 0.76171875, + "learning_rate": 7.620322580645162e-05, + "loss": 0.1892, + "step": 15255 + }, + { + "epoch": 0.244096, + "grad_norm": 0.84765625, + "learning_rate": 7.620161290322581e-05, + "loss": 0.1224, + "step": 15256 + }, + { + "epoch": 0.244112, + "grad_norm": 0.69921875, + "learning_rate": 7.620000000000001e-05, + "loss": 0.1848, + "step": 15257 + }, + { + "epoch": 0.244128, + "grad_norm": 0.66015625, + "learning_rate": 7.61983870967742e-05, + "loss": 0.1736, + "step": 15258 + }, + { + "epoch": 0.244144, + "grad_norm": 0.82421875, + "learning_rate": 7.61967741935484e-05, + "loss": 0.1579, + "step": 15259 + }, + { + "epoch": 0.24416, + "grad_norm": 0.68359375, + "learning_rate": 7.619516129032258e-05, + "loss": 0.1319, + "step": 15260 + }, + { + "epoch": 0.244176, + "grad_norm": 0.95703125, + "learning_rate": 7.619354838709678e-05, + "loss": 0.1857, + "step": 15261 + }, + { + "epoch": 0.244192, + "grad_norm": 0.5859375, + "learning_rate": 7.619193548387097e-05, + "loss": 0.1715, + "step": 15262 + }, + { + "epoch": 0.244208, + "grad_norm": 0.78515625, + "learning_rate": 7.619032258064517e-05, + "loss": 0.1808, + "step": 15263 + }, + { + "epoch": 0.244224, + "grad_norm": 1.3046875, + "learning_rate": 7.618870967741935e-05, + "loss": 0.1938, + "step": 15264 + }, + { + "epoch": 0.24424, + "grad_norm": 1.5078125, + "learning_rate": 7.618709677419355e-05, + "loss": 0.1698, + "step": 15265 + }, + { + "epoch": 0.244256, + "grad_norm": 1.1015625, + "learning_rate": 7.618548387096774e-05, + "loss": 0.2052, + "step": 15266 + }, + { + "epoch": 0.244272, + "grad_norm": 0.80078125, + "learning_rate": 7.618387096774194e-05, + "loss": 0.1568, + "step": 15267 + }, + { + "epoch": 0.244288, + "grad_norm": 0.6484375, + "learning_rate": 7.618225806451614e-05, + "loss": 0.1619, + "step": 15268 + }, + { + "epoch": 0.244304, + "grad_norm": 1.0390625, + "learning_rate": 7.618064516129034e-05, + "loss": 0.2027, + "step": 15269 + }, + { + "epoch": 0.24432, + "grad_norm": 0.62109375, + "learning_rate": 7.617903225806452e-05, + "loss": 0.137, + "step": 15270 + }, + { + "epoch": 0.244336, + "grad_norm": 0.75, + "learning_rate": 7.617741935483871e-05, + "loss": 0.2011, + "step": 15271 + }, + { + "epoch": 0.244352, + "grad_norm": 0.65625, + "learning_rate": 7.617580645161291e-05, + "loss": 0.1637, + "step": 15272 + }, + { + "epoch": 0.244368, + "grad_norm": 1.4140625, + "learning_rate": 7.61741935483871e-05, + "loss": 0.1895, + "step": 15273 + }, + { + "epoch": 0.244384, + "grad_norm": 0.8125, + "learning_rate": 7.61725806451613e-05, + "loss": 0.1743, + "step": 15274 + }, + { + "epoch": 0.2444, + "grad_norm": 0.7109375, + "learning_rate": 7.617096774193548e-05, + "loss": 0.1295, + "step": 15275 + }, + { + "epoch": 0.244416, + "grad_norm": 1.8359375, + "learning_rate": 7.616935483870968e-05, + "loss": 0.17, + "step": 15276 + }, + { + "epoch": 0.244432, + "grad_norm": 1.703125, + "learning_rate": 7.616774193548387e-05, + "loss": 0.1978, + "step": 15277 + }, + { + "epoch": 0.244448, + "grad_norm": 0.53125, + "learning_rate": 7.616612903225807e-05, + "loss": 0.1391, + "step": 15278 + }, + { + "epoch": 0.244464, + "grad_norm": 1.25, + "learning_rate": 7.616451612903227e-05, + "loss": 0.2166, + "step": 15279 + }, + { + "epoch": 0.24448, + "grad_norm": 0.80078125, + "learning_rate": 7.616290322580647e-05, + "loss": 0.1749, + "step": 15280 + }, + { + "epoch": 0.244496, + "grad_norm": 1.0859375, + "learning_rate": 7.616129032258065e-05, + "loss": 0.1865, + "step": 15281 + }, + { + "epoch": 0.244512, + "grad_norm": 1.0, + "learning_rate": 7.615967741935485e-05, + "loss": 0.1575, + "step": 15282 + }, + { + "epoch": 0.244528, + "grad_norm": 0.6875, + "learning_rate": 7.615806451612904e-05, + "loss": 0.1685, + "step": 15283 + }, + { + "epoch": 0.244544, + "grad_norm": 1.40625, + "learning_rate": 7.615645161290324e-05, + "loss": 0.1733, + "step": 15284 + }, + { + "epoch": 0.24456, + "grad_norm": 0.9140625, + "learning_rate": 7.615483870967742e-05, + "loss": 0.1575, + "step": 15285 + }, + { + "epoch": 0.244576, + "grad_norm": 0.8125, + "learning_rate": 7.615322580645161e-05, + "loss": 0.1638, + "step": 15286 + }, + { + "epoch": 0.244592, + "grad_norm": 0.60546875, + "learning_rate": 7.615161290322581e-05, + "loss": 0.1573, + "step": 15287 + }, + { + "epoch": 0.244608, + "grad_norm": 0.9296875, + "learning_rate": 7.615e-05, + "loss": 0.2123, + "step": 15288 + }, + { + "epoch": 0.244624, + "grad_norm": 0.89453125, + "learning_rate": 7.61483870967742e-05, + "loss": 0.1725, + "step": 15289 + }, + { + "epoch": 0.24464, + "grad_norm": 0.703125, + "learning_rate": 7.61467741935484e-05, + "loss": 0.1831, + "step": 15290 + }, + { + "epoch": 0.244656, + "grad_norm": 0.62890625, + "learning_rate": 7.614516129032258e-05, + "loss": 0.1765, + "step": 15291 + }, + { + "epoch": 0.244672, + "grad_norm": 1.0859375, + "learning_rate": 7.614354838709678e-05, + "loss": 0.1365, + "step": 15292 + }, + { + "epoch": 0.244688, + "grad_norm": 1.0390625, + "learning_rate": 7.614193548387098e-05, + "loss": 0.1225, + "step": 15293 + }, + { + "epoch": 0.244704, + "grad_norm": 0.59765625, + "learning_rate": 7.614032258064516e-05, + "loss": 0.1673, + "step": 15294 + }, + { + "epoch": 0.24472, + "grad_norm": 1.0, + "learning_rate": 7.613870967741936e-05, + "loss": 0.2106, + "step": 15295 + }, + { + "epoch": 0.244736, + "grad_norm": 1.3125, + "learning_rate": 7.613709677419355e-05, + "loss": 0.2084, + "step": 15296 + }, + { + "epoch": 0.244752, + "grad_norm": 0.859375, + "learning_rate": 7.613548387096775e-05, + "loss": 0.2426, + "step": 15297 + }, + { + "epoch": 0.244768, + "grad_norm": 0.5390625, + "learning_rate": 7.613387096774194e-05, + "loss": 0.1566, + "step": 15298 + }, + { + "epoch": 0.244784, + "grad_norm": 0.87890625, + "learning_rate": 7.613225806451614e-05, + "loss": 0.1583, + "step": 15299 + }, + { + "epoch": 0.2448, + "grad_norm": 0.8359375, + "learning_rate": 7.613064516129032e-05, + "loss": 0.1644, + "step": 15300 + }, + { + "epoch": 0.244816, + "grad_norm": 0.6015625, + "learning_rate": 7.612903225806451e-05, + "loss": 0.1364, + "step": 15301 + }, + { + "epoch": 0.244832, + "grad_norm": 1.0625, + "learning_rate": 7.612741935483871e-05, + "loss": 0.1467, + "step": 15302 + }, + { + "epoch": 0.244848, + "grad_norm": 1.0390625, + "learning_rate": 7.612580645161291e-05, + "loss": 0.2061, + "step": 15303 + }, + { + "epoch": 0.244864, + "grad_norm": 0.78515625, + "learning_rate": 7.612419354838711e-05, + "loss": 0.1732, + "step": 15304 + }, + { + "epoch": 0.24488, + "grad_norm": 0.82421875, + "learning_rate": 7.612258064516129e-05, + "loss": 0.1855, + "step": 15305 + }, + { + "epoch": 0.244896, + "grad_norm": 0.7109375, + "learning_rate": 7.612096774193549e-05, + "loss": 0.1613, + "step": 15306 + }, + { + "epoch": 0.244912, + "grad_norm": 0.78125, + "learning_rate": 7.611935483870968e-05, + "loss": 0.1856, + "step": 15307 + }, + { + "epoch": 0.244928, + "grad_norm": 0.6171875, + "learning_rate": 7.611774193548388e-05, + "loss": 0.1785, + "step": 15308 + }, + { + "epoch": 0.244944, + "grad_norm": 1.5546875, + "learning_rate": 7.611612903225806e-05, + "loss": 0.2328, + "step": 15309 + }, + { + "epoch": 0.24496, + "grad_norm": 1.0625, + "learning_rate": 7.611451612903226e-05, + "loss": 0.1805, + "step": 15310 + }, + { + "epoch": 0.244976, + "grad_norm": 1.09375, + "learning_rate": 7.611290322580645e-05, + "loss": 0.1492, + "step": 15311 + }, + { + "epoch": 0.244992, + "grad_norm": 0.58984375, + "learning_rate": 7.611129032258065e-05, + "loss": 0.1665, + "step": 15312 + }, + { + "epoch": 0.245008, + "grad_norm": 1.09375, + "learning_rate": 7.610967741935484e-05, + "loss": 0.1888, + "step": 15313 + }, + { + "epoch": 0.245024, + "grad_norm": 0.65625, + "learning_rate": 7.610806451612904e-05, + "loss": 0.1846, + "step": 15314 + }, + { + "epoch": 0.24504, + "grad_norm": 0.515625, + "learning_rate": 7.610645161290324e-05, + "loss": 0.1601, + "step": 15315 + }, + { + "epoch": 0.245056, + "grad_norm": 0.98828125, + "learning_rate": 7.610483870967743e-05, + "loss": 0.1635, + "step": 15316 + }, + { + "epoch": 0.245072, + "grad_norm": 0.70703125, + "learning_rate": 7.610322580645162e-05, + "loss": 0.1887, + "step": 15317 + }, + { + "epoch": 0.245088, + "grad_norm": 0.84765625, + "learning_rate": 7.610161290322581e-05, + "loss": 0.1721, + "step": 15318 + }, + { + "epoch": 0.245104, + "grad_norm": 0.455078125, + "learning_rate": 7.61e-05, + "loss": 0.1694, + "step": 15319 + }, + { + "epoch": 0.24512, + "grad_norm": 1.0, + "learning_rate": 7.609838709677419e-05, + "loss": 0.191, + "step": 15320 + }, + { + "epoch": 0.245136, + "grad_norm": 0.62890625, + "learning_rate": 7.609677419354839e-05, + "loss": 0.16, + "step": 15321 + }, + { + "epoch": 0.245152, + "grad_norm": 1.2265625, + "learning_rate": 7.609516129032258e-05, + "loss": 0.188, + "step": 15322 + }, + { + "epoch": 0.245168, + "grad_norm": 0.609375, + "learning_rate": 7.609354838709678e-05, + "loss": 0.1673, + "step": 15323 + }, + { + "epoch": 0.245184, + "grad_norm": 1.1171875, + "learning_rate": 7.609193548387096e-05, + "loss": 0.1634, + "step": 15324 + }, + { + "epoch": 0.2452, + "grad_norm": 0.8359375, + "learning_rate": 7.609032258064516e-05, + "loss": 0.1631, + "step": 15325 + }, + { + "epoch": 0.245216, + "grad_norm": 0.6796875, + "learning_rate": 7.608870967741935e-05, + "loss": 0.1809, + "step": 15326 + }, + { + "epoch": 0.245232, + "grad_norm": 0.65625, + "learning_rate": 7.608709677419355e-05, + "loss": 0.1285, + "step": 15327 + }, + { + "epoch": 0.245248, + "grad_norm": 1.109375, + "learning_rate": 7.608548387096775e-05, + "loss": 0.158, + "step": 15328 + }, + { + "epoch": 0.245264, + "grad_norm": 0.609375, + "learning_rate": 7.608387096774195e-05, + "loss": 0.1482, + "step": 15329 + }, + { + "epoch": 0.24528, + "grad_norm": 0.75390625, + "learning_rate": 7.608225806451613e-05, + "loss": 0.1561, + "step": 15330 + }, + { + "epoch": 0.245296, + "grad_norm": 0.6484375, + "learning_rate": 7.608064516129033e-05, + "loss": 0.1692, + "step": 15331 + }, + { + "epoch": 0.245312, + "grad_norm": 1.109375, + "learning_rate": 7.607903225806452e-05, + "loss": 0.1424, + "step": 15332 + }, + { + "epoch": 0.245328, + "grad_norm": 0.59765625, + "learning_rate": 7.60774193548387e-05, + "loss": 0.1636, + "step": 15333 + }, + { + "epoch": 0.245344, + "grad_norm": 1.109375, + "learning_rate": 7.60758064516129e-05, + "loss": 0.1671, + "step": 15334 + }, + { + "epoch": 0.24536, + "grad_norm": 0.73828125, + "learning_rate": 7.607419354838709e-05, + "loss": 0.1572, + "step": 15335 + }, + { + "epoch": 0.245376, + "grad_norm": 0.455078125, + "learning_rate": 7.607258064516129e-05, + "loss": 0.1382, + "step": 15336 + }, + { + "epoch": 0.245392, + "grad_norm": 0.95703125, + "learning_rate": 7.607096774193548e-05, + "loss": 0.1336, + "step": 15337 + }, + { + "epoch": 0.245408, + "grad_norm": 0.75390625, + "learning_rate": 7.606935483870968e-05, + "loss": 0.1912, + "step": 15338 + }, + { + "epoch": 0.245424, + "grad_norm": 0.890625, + "learning_rate": 7.606774193548388e-05, + "loss": 0.1745, + "step": 15339 + }, + { + "epoch": 0.24544, + "grad_norm": 0.9453125, + "learning_rate": 7.606612903225808e-05, + "loss": 0.1924, + "step": 15340 + }, + { + "epoch": 0.245456, + "grad_norm": 1.0703125, + "learning_rate": 7.606451612903226e-05, + "loss": 0.201, + "step": 15341 + }, + { + "epoch": 0.245472, + "grad_norm": 0.74609375, + "learning_rate": 7.606290322580646e-05, + "loss": 0.1552, + "step": 15342 + }, + { + "epoch": 0.245488, + "grad_norm": 0.67578125, + "learning_rate": 7.606129032258065e-05, + "loss": 0.2033, + "step": 15343 + }, + { + "epoch": 0.245504, + "grad_norm": 0.7109375, + "learning_rate": 7.605967741935485e-05, + "loss": 0.1758, + "step": 15344 + }, + { + "epoch": 0.24552, + "grad_norm": 0.75390625, + "learning_rate": 7.605806451612903e-05, + "loss": 0.154, + "step": 15345 + }, + { + "epoch": 0.245536, + "grad_norm": 0.7578125, + "learning_rate": 7.605645161290323e-05, + "loss": 0.1474, + "step": 15346 + }, + { + "epoch": 0.245552, + "grad_norm": 1.0703125, + "learning_rate": 7.605483870967742e-05, + "loss": 0.1948, + "step": 15347 + }, + { + "epoch": 0.245568, + "grad_norm": 0.54296875, + "learning_rate": 7.60532258064516e-05, + "loss": 0.1545, + "step": 15348 + }, + { + "epoch": 0.245584, + "grad_norm": 0.5625, + "learning_rate": 7.60516129032258e-05, + "loss": 0.1423, + "step": 15349 + }, + { + "epoch": 0.2456, + "grad_norm": 0.625, + "learning_rate": 7.605e-05, + "loss": 0.1192, + "step": 15350 + }, + { + "epoch": 0.245616, + "grad_norm": 0.76171875, + "learning_rate": 7.60483870967742e-05, + "loss": 0.1834, + "step": 15351 + }, + { + "epoch": 0.245632, + "grad_norm": 0.5703125, + "learning_rate": 7.604677419354839e-05, + "loss": 0.1639, + "step": 15352 + }, + { + "epoch": 0.245648, + "grad_norm": 1.7421875, + "learning_rate": 7.604516129032259e-05, + "loss": 0.1868, + "step": 15353 + }, + { + "epoch": 0.245664, + "grad_norm": 0.75390625, + "learning_rate": 7.604354838709678e-05, + "loss": 0.1596, + "step": 15354 + }, + { + "epoch": 0.24568, + "grad_norm": 0.92578125, + "learning_rate": 7.604193548387098e-05, + "loss": 0.1634, + "step": 15355 + }, + { + "epoch": 0.245696, + "grad_norm": 0.7578125, + "learning_rate": 7.604032258064516e-05, + "loss": 0.1865, + "step": 15356 + }, + { + "epoch": 0.245712, + "grad_norm": 0.57421875, + "learning_rate": 7.603870967741936e-05, + "loss": 0.1673, + "step": 15357 + }, + { + "epoch": 0.245728, + "grad_norm": 1.0625, + "learning_rate": 7.603709677419355e-05, + "loss": 0.1948, + "step": 15358 + }, + { + "epoch": 0.245744, + "grad_norm": 1.2265625, + "learning_rate": 7.603548387096775e-05, + "loss": 0.1891, + "step": 15359 + }, + { + "epoch": 0.24576, + "grad_norm": 0.6953125, + "learning_rate": 7.603387096774193e-05, + "loss": 0.1653, + "step": 15360 + }, + { + "epoch": 0.245776, + "grad_norm": 0.734375, + "learning_rate": 7.603225806451613e-05, + "loss": 0.1975, + "step": 15361 + }, + { + "epoch": 0.245792, + "grad_norm": 0.71484375, + "learning_rate": 7.603064516129032e-05, + "loss": 0.1376, + "step": 15362 + }, + { + "epoch": 0.245808, + "grad_norm": 1.2109375, + "learning_rate": 7.602903225806452e-05, + "loss": 0.1973, + "step": 15363 + }, + { + "epoch": 0.245824, + "grad_norm": 0.734375, + "learning_rate": 7.602741935483872e-05, + "loss": 0.1355, + "step": 15364 + }, + { + "epoch": 0.24584, + "grad_norm": 1.03125, + "learning_rate": 7.60258064516129e-05, + "loss": 0.1659, + "step": 15365 + }, + { + "epoch": 0.245856, + "grad_norm": 0.6953125, + "learning_rate": 7.60241935483871e-05, + "loss": 0.1784, + "step": 15366 + }, + { + "epoch": 0.245872, + "grad_norm": 1.1953125, + "learning_rate": 7.602258064516129e-05, + "loss": 0.1284, + "step": 15367 + }, + { + "epoch": 0.245888, + "grad_norm": 1.5546875, + "learning_rate": 7.602096774193549e-05, + "loss": 0.1783, + "step": 15368 + }, + { + "epoch": 0.245904, + "grad_norm": 0.6015625, + "learning_rate": 7.601935483870968e-05, + "loss": 0.1501, + "step": 15369 + }, + { + "epoch": 0.24592, + "grad_norm": 0.890625, + "learning_rate": 7.601774193548388e-05, + "loss": 0.1572, + "step": 15370 + }, + { + "epoch": 0.245936, + "grad_norm": 0.9453125, + "learning_rate": 7.601612903225806e-05, + "loss": 0.1768, + "step": 15371 + }, + { + "epoch": 0.245952, + "grad_norm": 0.92578125, + "learning_rate": 7.601451612903226e-05, + "loss": 0.1487, + "step": 15372 + }, + { + "epoch": 0.245968, + "grad_norm": 0.6015625, + "learning_rate": 7.601290322580645e-05, + "loss": 0.1582, + "step": 15373 + }, + { + "epoch": 0.245984, + "grad_norm": 0.609375, + "learning_rate": 7.601129032258065e-05, + "loss": 0.1629, + "step": 15374 + }, + { + "epoch": 0.246, + "grad_norm": 0.7265625, + "learning_rate": 7.600967741935485e-05, + "loss": 0.205, + "step": 15375 + }, + { + "epoch": 0.246016, + "grad_norm": 0.859375, + "learning_rate": 7.600806451612905e-05, + "loss": 0.1647, + "step": 15376 + }, + { + "epoch": 0.246032, + "grad_norm": 0.65625, + "learning_rate": 7.600645161290323e-05, + "loss": 0.1456, + "step": 15377 + }, + { + "epoch": 0.246048, + "grad_norm": 0.98828125, + "learning_rate": 7.600483870967743e-05, + "loss": 0.1671, + "step": 15378 + }, + { + "epoch": 0.246064, + "grad_norm": 0.87890625, + "learning_rate": 7.600322580645162e-05, + "loss": 0.1544, + "step": 15379 + }, + { + "epoch": 0.24608, + "grad_norm": 0.828125, + "learning_rate": 7.60016129032258e-05, + "loss": 0.1662, + "step": 15380 + }, + { + "epoch": 0.246096, + "grad_norm": 0.9453125, + "learning_rate": 7.6e-05, + "loss": 0.1855, + "step": 15381 + }, + { + "epoch": 0.246112, + "grad_norm": 0.76953125, + "learning_rate": 7.599838709677419e-05, + "loss": 0.1717, + "step": 15382 + }, + { + "epoch": 0.246128, + "grad_norm": 0.54296875, + "learning_rate": 7.599677419354839e-05, + "loss": 0.1707, + "step": 15383 + }, + { + "epoch": 0.246144, + "grad_norm": 0.7734375, + "learning_rate": 7.599516129032258e-05, + "loss": 0.1895, + "step": 15384 + }, + { + "epoch": 0.24616, + "grad_norm": 0.92578125, + "learning_rate": 7.599354838709678e-05, + "loss": 0.2248, + "step": 15385 + }, + { + "epoch": 0.246176, + "grad_norm": 0.80859375, + "learning_rate": 7.599193548387098e-05, + "loss": 0.2218, + "step": 15386 + }, + { + "epoch": 0.246192, + "grad_norm": 0.796875, + "learning_rate": 7.599032258064516e-05, + "loss": 0.1471, + "step": 15387 + }, + { + "epoch": 0.246208, + "grad_norm": 0.8984375, + "learning_rate": 7.598870967741936e-05, + "loss": 0.133, + "step": 15388 + }, + { + "epoch": 0.246224, + "grad_norm": 0.87109375, + "learning_rate": 7.598709677419356e-05, + "loss": 0.1381, + "step": 15389 + }, + { + "epoch": 0.24624, + "grad_norm": 0.55859375, + "learning_rate": 7.598548387096775e-05, + "loss": 0.1618, + "step": 15390 + }, + { + "epoch": 0.246256, + "grad_norm": 0.6875, + "learning_rate": 7.598387096774195e-05, + "loss": 0.1622, + "step": 15391 + }, + { + "epoch": 0.246272, + "grad_norm": 0.53515625, + "learning_rate": 7.598225806451613e-05, + "loss": 0.1751, + "step": 15392 + }, + { + "epoch": 0.246288, + "grad_norm": 0.6328125, + "learning_rate": 7.598064516129033e-05, + "loss": 0.1571, + "step": 15393 + }, + { + "epoch": 0.246304, + "grad_norm": 1.015625, + "learning_rate": 7.597903225806452e-05, + "loss": 0.1821, + "step": 15394 + }, + { + "epoch": 0.24632, + "grad_norm": 0.9453125, + "learning_rate": 7.59774193548387e-05, + "loss": 0.2013, + "step": 15395 + }, + { + "epoch": 0.246336, + "grad_norm": 0.703125, + "learning_rate": 7.59758064516129e-05, + "loss": 0.1484, + "step": 15396 + }, + { + "epoch": 0.246352, + "grad_norm": 0.84375, + "learning_rate": 7.597419354838709e-05, + "loss": 0.2059, + "step": 15397 + }, + { + "epoch": 0.246368, + "grad_norm": 0.63671875, + "learning_rate": 7.597258064516129e-05, + "loss": 0.165, + "step": 15398 + }, + { + "epoch": 0.246384, + "grad_norm": 0.73046875, + "learning_rate": 7.597096774193549e-05, + "loss": 0.1951, + "step": 15399 + }, + { + "epoch": 0.2464, + "grad_norm": 1.1328125, + "learning_rate": 7.596935483870969e-05, + "loss": 0.1651, + "step": 15400 + }, + { + "epoch": 0.246416, + "grad_norm": 0.65234375, + "learning_rate": 7.596774193548387e-05, + "loss": 0.1955, + "step": 15401 + }, + { + "epoch": 0.246432, + "grad_norm": 1.109375, + "learning_rate": 7.596612903225807e-05, + "loss": 0.1725, + "step": 15402 + }, + { + "epoch": 0.246448, + "grad_norm": 0.80859375, + "learning_rate": 7.596451612903226e-05, + "loss": 0.1962, + "step": 15403 + }, + { + "epoch": 0.246464, + "grad_norm": 0.625, + "learning_rate": 7.596290322580646e-05, + "loss": 0.1739, + "step": 15404 + }, + { + "epoch": 0.24648, + "grad_norm": 0.796875, + "learning_rate": 7.596129032258065e-05, + "loss": 0.2024, + "step": 15405 + }, + { + "epoch": 0.246496, + "grad_norm": 1.1484375, + "learning_rate": 7.595967741935485e-05, + "loss": 0.1967, + "step": 15406 + }, + { + "epoch": 0.246512, + "grad_norm": 0.55078125, + "learning_rate": 7.595806451612903e-05, + "loss": 0.1294, + "step": 15407 + }, + { + "epoch": 0.246528, + "grad_norm": 1.3203125, + "learning_rate": 7.595645161290323e-05, + "loss": 0.1555, + "step": 15408 + }, + { + "epoch": 0.246544, + "grad_norm": 0.80078125, + "learning_rate": 7.595483870967742e-05, + "loss": 0.1373, + "step": 15409 + }, + { + "epoch": 0.24656, + "grad_norm": 0.6640625, + "learning_rate": 7.595322580645162e-05, + "loss": 0.1843, + "step": 15410 + }, + { + "epoch": 0.246576, + "grad_norm": 1.0390625, + "learning_rate": 7.595161290322582e-05, + "loss": 0.2012, + "step": 15411 + }, + { + "epoch": 0.246592, + "grad_norm": 0.9609375, + "learning_rate": 7.595e-05, + "loss": 0.1639, + "step": 15412 + }, + { + "epoch": 0.246608, + "grad_norm": 0.9296875, + "learning_rate": 7.59483870967742e-05, + "loss": 0.1533, + "step": 15413 + }, + { + "epoch": 0.246624, + "grad_norm": 0.90625, + "learning_rate": 7.594677419354839e-05, + "loss": 0.1819, + "step": 15414 + }, + { + "epoch": 0.24664, + "grad_norm": 0.80859375, + "learning_rate": 7.594516129032259e-05, + "loss": 0.1921, + "step": 15415 + }, + { + "epoch": 0.246656, + "grad_norm": 0.76953125, + "learning_rate": 7.594354838709677e-05, + "loss": 0.1434, + "step": 15416 + }, + { + "epoch": 0.246672, + "grad_norm": 1.25, + "learning_rate": 7.594193548387097e-05, + "loss": 0.1647, + "step": 15417 + }, + { + "epoch": 0.246688, + "grad_norm": 0.86328125, + "learning_rate": 7.594032258064516e-05, + "loss": 0.1647, + "step": 15418 + }, + { + "epoch": 0.246704, + "grad_norm": 0.640625, + "learning_rate": 7.593870967741936e-05, + "loss": 0.1694, + "step": 15419 + }, + { + "epoch": 0.24672, + "grad_norm": 0.62109375, + "learning_rate": 7.593709677419355e-05, + "loss": 0.1356, + "step": 15420 + }, + { + "epoch": 0.246736, + "grad_norm": 0.6640625, + "learning_rate": 7.593548387096775e-05, + "loss": 0.1509, + "step": 15421 + }, + { + "epoch": 0.246752, + "grad_norm": 1.0859375, + "learning_rate": 7.593387096774193e-05, + "loss": 0.1771, + "step": 15422 + }, + { + "epoch": 0.246768, + "grad_norm": 0.5625, + "learning_rate": 7.593225806451613e-05, + "loss": 0.1893, + "step": 15423 + }, + { + "epoch": 0.246784, + "grad_norm": 0.55078125, + "learning_rate": 7.593064516129033e-05, + "loss": 0.1391, + "step": 15424 + }, + { + "epoch": 0.2468, + "grad_norm": 0.6640625, + "learning_rate": 7.592903225806453e-05, + "loss": 0.1636, + "step": 15425 + }, + { + "epoch": 0.246816, + "grad_norm": 1.1875, + "learning_rate": 7.592741935483872e-05, + "loss": 0.2029, + "step": 15426 + }, + { + "epoch": 0.246832, + "grad_norm": 0.78515625, + "learning_rate": 7.59258064516129e-05, + "loss": 0.1697, + "step": 15427 + }, + { + "epoch": 0.246848, + "grad_norm": 0.6875, + "learning_rate": 7.59241935483871e-05, + "loss": 0.1497, + "step": 15428 + }, + { + "epoch": 0.246864, + "grad_norm": 0.9375, + "learning_rate": 7.592258064516129e-05, + "loss": 0.1606, + "step": 15429 + }, + { + "epoch": 0.24688, + "grad_norm": 0.578125, + "learning_rate": 7.592096774193549e-05, + "loss": 0.1971, + "step": 15430 + }, + { + "epoch": 0.246896, + "grad_norm": 0.62890625, + "learning_rate": 7.591935483870967e-05, + "loss": 0.129, + "step": 15431 + }, + { + "epoch": 0.246912, + "grad_norm": 0.625, + "learning_rate": 7.591774193548387e-05, + "loss": 0.173, + "step": 15432 + }, + { + "epoch": 0.246928, + "grad_norm": 0.5390625, + "learning_rate": 7.591612903225806e-05, + "loss": 0.1462, + "step": 15433 + }, + { + "epoch": 0.246944, + "grad_norm": 0.63671875, + "learning_rate": 7.591451612903226e-05, + "loss": 0.1588, + "step": 15434 + }, + { + "epoch": 0.24696, + "grad_norm": 0.9375, + "learning_rate": 7.591290322580646e-05, + "loss": 0.1648, + "step": 15435 + }, + { + "epoch": 0.246976, + "grad_norm": 0.75390625, + "learning_rate": 7.591129032258066e-05, + "loss": 0.1646, + "step": 15436 + }, + { + "epoch": 0.246992, + "grad_norm": 0.9375, + "learning_rate": 7.590967741935484e-05, + "loss": 0.2133, + "step": 15437 + }, + { + "epoch": 0.247008, + "grad_norm": 1.21875, + "learning_rate": 7.590806451612904e-05, + "loss": 0.1698, + "step": 15438 + }, + { + "epoch": 0.247024, + "grad_norm": 0.84765625, + "learning_rate": 7.590645161290323e-05, + "loss": 0.2047, + "step": 15439 + }, + { + "epoch": 0.24704, + "grad_norm": 1.0703125, + "learning_rate": 7.590483870967743e-05, + "loss": 0.1967, + "step": 15440 + }, + { + "epoch": 0.247056, + "grad_norm": 0.80078125, + "learning_rate": 7.590322580645162e-05, + "loss": 0.2066, + "step": 15441 + }, + { + "epoch": 0.247072, + "grad_norm": 1.0546875, + "learning_rate": 7.59016129032258e-05, + "loss": 0.2106, + "step": 15442 + }, + { + "epoch": 0.247088, + "grad_norm": 0.66796875, + "learning_rate": 7.59e-05, + "loss": 0.1532, + "step": 15443 + }, + { + "epoch": 0.247104, + "grad_norm": 1.03125, + "learning_rate": 7.589838709677419e-05, + "loss": 0.1474, + "step": 15444 + }, + { + "epoch": 0.24712, + "grad_norm": 0.66796875, + "learning_rate": 7.589677419354839e-05, + "loss": 0.1662, + "step": 15445 + }, + { + "epoch": 0.247136, + "grad_norm": 0.66015625, + "learning_rate": 7.589516129032259e-05, + "loss": 0.1709, + "step": 15446 + }, + { + "epoch": 0.247152, + "grad_norm": 0.6875, + "learning_rate": 7.589354838709679e-05, + "loss": 0.1873, + "step": 15447 + }, + { + "epoch": 0.247168, + "grad_norm": 0.75, + "learning_rate": 7.589193548387097e-05, + "loss": 0.1466, + "step": 15448 + }, + { + "epoch": 0.247184, + "grad_norm": 0.52734375, + "learning_rate": 7.589032258064517e-05, + "loss": 0.1193, + "step": 15449 + }, + { + "epoch": 0.2472, + "grad_norm": 0.69921875, + "learning_rate": 7.588870967741936e-05, + "loss": 0.1778, + "step": 15450 + }, + { + "epoch": 0.247216, + "grad_norm": 0.75390625, + "learning_rate": 7.588709677419356e-05, + "loss": 0.1362, + "step": 15451 + }, + { + "epoch": 0.247232, + "grad_norm": 0.6015625, + "learning_rate": 7.588548387096774e-05, + "loss": 0.1619, + "step": 15452 + }, + { + "epoch": 0.247248, + "grad_norm": 0.78125, + "learning_rate": 7.588387096774194e-05, + "loss": 0.195, + "step": 15453 + }, + { + "epoch": 0.247264, + "grad_norm": 0.96484375, + "learning_rate": 7.588225806451613e-05, + "loss": 0.1695, + "step": 15454 + }, + { + "epoch": 0.24728, + "grad_norm": 0.78515625, + "learning_rate": 7.588064516129033e-05, + "loss": 0.1611, + "step": 15455 + }, + { + "epoch": 0.247296, + "grad_norm": 1.078125, + "learning_rate": 7.587903225806452e-05, + "loss": 0.1709, + "step": 15456 + }, + { + "epoch": 0.247312, + "grad_norm": 0.66796875, + "learning_rate": 7.58774193548387e-05, + "loss": 0.1891, + "step": 15457 + }, + { + "epoch": 0.247328, + "grad_norm": 0.99609375, + "learning_rate": 7.58758064516129e-05, + "loss": 0.2051, + "step": 15458 + }, + { + "epoch": 0.247344, + "grad_norm": 0.796875, + "learning_rate": 7.58741935483871e-05, + "loss": 0.1903, + "step": 15459 + }, + { + "epoch": 0.24736, + "grad_norm": 0.80859375, + "learning_rate": 7.58725806451613e-05, + "loss": 0.1573, + "step": 15460 + }, + { + "epoch": 0.247376, + "grad_norm": 0.79296875, + "learning_rate": 7.587096774193549e-05, + "loss": 0.1747, + "step": 15461 + }, + { + "epoch": 0.247392, + "grad_norm": 0.80859375, + "learning_rate": 7.586935483870969e-05, + "loss": 0.184, + "step": 15462 + }, + { + "epoch": 0.247408, + "grad_norm": 0.6796875, + "learning_rate": 7.586774193548387e-05, + "loss": 0.146, + "step": 15463 + }, + { + "epoch": 0.247424, + "grad_norm": 1.0078125, + "learning_rate": 7.586612903225807e-05, + "loss": 0.1849, + "step": 15464 + }, + { + "epoch": 0.24744, + "grad_norm": 0.75390625, + "learning_rate": 7.586451612903226e-05, + "loss": 0.133, + "step": 15465 + }, + { + "epoch": 0.247456, + "grad_norm": 0.75, + "learning_rate": 7.586290322580646e-05, + "loss": 0.1359, + "step": 15466 + }, + { + "epoch": 0.247472, + "grad_norm": 0.68359375, + "learning_rate": 7.586129032258064e-05, + "loss": 0.1769, + "step": 15467 + }, + { + "epoch": 0.247488, + "grad_norm": 0.7265625, + "learning_rate": 7.585967741935484e-05, + "loss": 0.1627, + "step": 15468 + }, + { + "epoch": 0.247504, + "grad_norm": 0.7109375, + "learning_rate": 7.585806451612903e-05, + "loss": 0.1635, + "step": 15469 + }, + { + "epoch": 0.24752, + "grad_norm": 1.0390625, + "learning_rate": 7.585645161290323e-05, + "loss": 0.1788, + "step": 15470 + }, + { + "epoch": 0.247536, + "grad_norm": 0.6171875, + "learning_rate": 7.585483870967743e-05, + "loss": 0.1828, + "step": 15471 + }, + { + "epoch": 0.247552, + "grad_norm": 0.765625, + "learning_rate": 7.585322580645161e-05, + "loss": 0.1339, + "step": 15472 + }, + { + "epoch": 0.247568, + "grad_norm": 1.1171875, + "learning_rate": 7.585161290322581e-05, + "loss": 0.18, + "step": 15473 + }, + { + "epoch": 0.247584, + "grad_norm": 0.8125, + "learning_rate": 7.585e-05, + "loss": 0.1708, + "step": 15474 + }, + { + "epoch": 0.2476, + "grad_norm": 0.66796875, + "learning_rate": 7.58483870967742e-05, + "loss": 0.2017, + "step": 15475 + }, + { + "epoch": 0.247616, + "grad_norm": 0.5859375, + "learning_rate": 7.584677419354839e-05, + "loss": 0.1744, + "step": 15476 + }, + { + "epoch": 0.247632, + "grad_norm": 0.79296875, + "learning_rate": 7.584516129032259e-05, + "loss": 0.1675, + "step": 15477 + }, + { + "epoch": 0.247648, + "grad_norm": 1.3125, + "learning_rate": 7.584354838709677e-05, + "loss": 0.1275, + "step": 15478 + }, + { + "epoch": 0.247664, + "grad_norm": 0.94921875, + "learning_rate": 7.584193548387097e-05, + "loss": 0.1665, + "step": 15479 + }, + { + "epoch": 0.24768, + "grad_norm": 0.7421875, + "learning_rate": 7.584032258064516e-05, + "loss": 0.1872, + "step": 15480 + }, + { + "epoch": 0.247696, + "grad_norm": 0.99609375, + "learning_rate": 7.583870967741936e-05, + "loss": 0.1734, + "step": 15481 + }, + { + "epoch": 0.247712, + "grad_norm": 0.99609375, + "learning_rate": 7.583709677419354e-05, + "loss": 0.1386, + "step": 15482 + }, + { + "epoch": 0.247728, + "grad_norm": 0.8046875, + "learning_rate": 7.583548387096774e-05, + "loss": 0.1715, + "step": 15483 + }, + { + "epoch": 0.247744, + "grad_norm": 0.859375, + "learning_rate": 7.583387096774194e-05, + "loss": 0.1575, + "step": 15484 + }, + { + "epoch": 0.24776, + "grad_norm": 0.59375, + "learning_rate": 7.583225806451614e-05, + "loss": 0.1695, + "step": 15485 + }, + { + "epoch": 0.247776, + "grad_norm": 1.515625, + "learning_rate": 7.583064516129033e-05, + "loss": 0.1985, + "step": 15486 + }, + { + "epoch": 0.247792, + "grad_norm": 0.875, + "learning_rate": 7.582903225806453e-05, + "loss": 0.2003, + "step": 15487 + }, + { + "epoch": 0.247808, + "grad_norm": 0.8671875, + "learning_rate": 7.582741935483871e-05, + "loss": 0.1464, + "step": 15488 + }, + { + "epoch": 0.247824, + "grad_norm": 1.0390625, + "learning_rate": 7.58258064516129e-05, + "loss": 0.1334, + "step": 15489 + }, + { + "epoch": 0.24784, + "grad_norm": 0.63671875, + "learning_rate": 7.58241935483871e-05, + "loss": 0.1565, + "step": 15490 + }, + { + "epoch": 0.247856, + "grad_norm": 0.78515625, + "learning_rate": 7.582258064516129e-05, + "loss": 0.184, + "step": 15491 + }, + { + "epoch": 0.247872, + "grad_norm": 0.6796875, + "learning_rate": 7.582096774193549e-05, + "loss": 0.139, + "step": 15492 + }, + { + "epoch": 0.247888, + "grad_norm": 0.64453125, + "learning_rate": 7.581935483870967e-05, + "loss": 0.1694, + "step": 15493 + }, + { + "epoch": 0.247904, + "grad_norm": 0.8359375, + "learning_rate": 7.581774193548387e-05, + "loss": 0.1793, + "step": 15494 + }, + { + "epoch": 0.24792, + "grad_norm": 1.078125, + "learning_rate": 7.581612903225807e-05, + "loss": 0.1585, + "step": 15495 + }, + { + "epoch": 0.247936, + "grad_norm": 0.84375, + "learning_rate": 7.581451612903227e-05, + "loss": 0.1715, + "step": 15496 + }, + { + "epoch": 0.247952, + "grad_norm": 0.71484375, + "learning_rate": 7.581290322580646e-05, + "loss": 0.1168, + "step": 15497 + }, + { + "epoch": 0.247968, + "grad_norm": 0.62109375, + "learning_rate": 7.581129032258066e-05, + "loss": 0.1509, + "step": 15498 + }, + { + "epoch": 0.247984, + "grad_norm": 0.671875, + "learning_rate": 7.580967741935484e-05, + "loss": 0.1574, + "step": 15499 + }, + { + "epoch": 0.248, + "grad_norm": 0.59765625, + "learning_rate": 7.580806451612904e-05, + "loss": 0.1513, + "step": 15500 + }, + { + "epoch": 0.248016, + "grad_norm": 1.2265625, + "learning_rate": 7.580645161290323e-05, + "loss": 0.1766, + "step": 15501 + }, + { + "epoch": 0.248032, + "grad_norm": 0.921875, + "learning_rate": 7.580483870967743e-05, + "loss": 0.1923, + "step": 15502 + }, + { + "epoch": 0.248048, + "grad_norm": 0.80078125, + "learning_rate": 7.580322580645161e-05, + "loss": 0.1471, + "step": 15503 + }, + { + "epoch": 0.248064, + "grad_norm": 0.9140625, + "learning_rate": 7.58016129032258e-05, + "loss": 0.1352, + "step": 15504 + }, + { + "epoch": 0.24808, + "grad_norm": 0.8125, + "learning_rate": 7.58e-05, + "loss": 0.1639, + "step": 15505 + }, + { + "epoch": 0.248096, + "grad_norm": 0.8828125, + "learning_rate": 7.57983870967742e-05, + "loss": 0.178, + "step": 15506 + }, + { + "epoch": 0.248112, + "grad_norm": 1.0390625, + "learning_rate": 7.57967741935484e-05, + "loss": 0.1569, + "step": 15507 + }, + { + "epoch": 0.248128, + "grad_norm": 0.5625, + "learning_rate": 7.579516129032258e-05, + "loss": 0.1608, + "step": 15508 + }, + { + "epoch": 0.248144, + "grad_norm": 0.66796875, + "learning_rate": 7.579354838709678e-05, + "loss": 0.1719, + "step": 15509 + }, + { + "epoch": 0.24816, + "grad_norm": 1.0234375, + "learning_rate": 7.579193548387097e-05, + "loss": 0.1764, + "step": 15510 + }, + { + "epoch": 0.248176, + "grad_norm": 0.6328125, + "learning_rate": 7.579032258064517e-05, + "loss": 0.1377, + "step": 15511 + }, + { + "epoch": 0.248192, + "grad_norm": 0.83984375, + "learning_rate": 7.578870967741936e-05, + "loss": 0.1934, + "step": 15512 + }, + { + "epoch": 0.248208, + "grad_norm": 0.8671875, + "learning_rate": 7.578709677419356e-05, + "loss": 0.1647, + "step": 15513 + }, + { + "epoch": 0.248224, + "grad_norm": 0.87890625, + "learning_rate": 7.578548387096774e-05, + "loss": 0.1418, + "step": 15514 + }, + { + "epoch": 0.24824, + "grad_norm": 0.7265625, + "learning_rate": 7.578387096774194e-05, + "loss": 0.1672, + "step": 15515 + }, + { + "epoch": 0.248256, + "grad_norm": 0.69921875, + "learning_rate": 7.578225806451613e-05, + "loss": 0.1936, + "step": 15516 + }, + { + "epoch": 0.248272, + "grad_norm": 0.765625, + "learning_rate": 7.578064516129033e-05, + "loss": 0.1426, + "step": 15517 + }, + { + "epoch": 0.248288, + "grad_norm": 0.83203125, + "learning_rate": 7.577903225806451e-05, + "loss": 0.1363, + "step": 15518 + }, + { + "epoch": 0.248304, + "grad_norm": 0.62890625, + "learning_rate": 7.577741935483871e-05, + "loss": 0.152, + "step": 15519 + }, + { + "epoch": 0.24832, + "grad_norm": 0.62890625, + "learning_rate": 7.577580645161291e-05, + "loss": 0.1372, + "step": 15520 + }, + { + "epoch": 0.248336, + "grad_norm": 0.73828125, + "learning_rate": 7.57741935483871e-05, + "loss": 0.1869, + "step": 15521 + }, + { + "epoch": 0.248352, + "grad_norm": 0.64453125, + "learning_rate": 7.57725806451613e-05, + "loss": 0.1647, + "step": 15522 + }, + { + "epoch": 0.248368, + "grad_norm": 0.72265625, + "learning_rate": 7.577096774193548e-05, + "loss": 0.1574, + "step": 15523 + }, + { + "epoch": 0.248384, + "grad_norm": 0.796875, + "learning_rate": 7.576935483870968e-05, + "loss": 0.222, + "step": 15524 + }, + { + "epoch": 0.2484, + "grad_norm": 1.25, + "learning_rate": 7.576774193548387e-05, + "loss": 0.1804, + "step": 15525 + }, + { + "epoch": 0.248416, + "grad_norm": 0.75, + "learning_rate": 7.576612903225807e-05, + "loss": 0.1974, + "step": 15526 + }, + { + "epoch": 0.248432, + "grad_norm": 0.5859375, + "learning_rate": 7.576451612903226e-05, + "loss": 0.1372, + "step": 15527 + }, + { + "epoch": 0.248448, + "grad_norm": 0.62890625, + "learning_rate": 7.576290322580646e-05, + "loss": 0.1431, + "step": 15528 + }, + { + "epoch": 0.248464, + "grad_norm": 0.6953125, + "learning_rate": 7.576129032258064e-05, + "loss": 0.17, + "step": 15529 + }, + { + "epoch": 0.24848, + "grad_norm": 0.54296875, + "learning_rate": 7.575967741935484e-05, + "loss": 0.1185, + "step": 15530 + }, + { + "epoch": 0.248496, + "grad_norm": 0.609375, + "learning_rate": 7.575806451612904e-05, + "loss": 0.1464, + "step": 15531 + }, + { + "epoch": 0.248512, + "grad_norm": 1.0625, + "learning_rate": 7.575645161290324e-05, + "loss": 0.1574, + "step": 15532 + }, + { + "epoch": 0.248528, + "grad_norm": 0.96484375, + "learning_rate": 7.575483870967743e-05, + "loss": 0.1541, + "step": 15533 + }, + { + "epoch": 0.248544, + "grad_norm": 0.81640625, + "learning_rate": 7.575322580645163e-05, + "loss": 0.1619, + "step": 15534 + }, + { + "epoch": 0.24856, + "grad_norm": 0.8203125, + "learning_rate": 7.575161290322581e-05, + "loss": 0.1584, + "step": 15535 + }, + { + "epoch": 0.248576, + "grad_norm": 0.69140625, + "learning_rate": 7.575e-05, + "loss": 0.1401, + "step": 15536 + }, + { + "epoch": 0.248592, + "grad_norm": 1.046875, + "learning_rate": 7.57483870967742e-05, + "loss": 0.1867, + "step": 15537 + }, + { + "epoch": 0.248608, + "grad_norm": 0.625, + "learning_rate": 7.574677419354838e-05, + "loss": 0.179, + "step": 15538 + }, + { + "epoch": 0.248624, + "grad_norm": 0.73046875, + "learning_rate": 7.574516129032258e-05, + "loss": 0.1691, + "step": 15539 + }, + { + "epoch": 0.24864, + "grad_norm": 0.8984375, + "learning_rate": 7.574354838709677e-05, + "loss": 0.1802, + "step": 15540 + }, + { + "epoch": 0.248656, + "grad_norm": 0.6484375, + "learning_rate": 7.574193548387097e-05, + "loss": 0.1958, + "step": 15541 + }, + { + "epoch": 0.248672, + "grad_norm": 1.0, + "learning_rate": 7.574032258064517e-05, + "loss": 0.1907, + "step": 15542 + }, + { + "epoch": 0.248688, + "grad_norm": 0.66796875, + "learning_rate": 7.573870967741935e-05, + "loss": 0.1401, + "step": 15543 + }, + { + "epoch": 0.248704, + "grad_norm": 0.9296875, + "learning_rate": 7.573709677419355e-05, + "loss": 0.1702, + "step": 15544 + }, + { + "epoch": 0.24872, + "grad_norm": 0.9921875, + "learning_rate": 7.573548387096775e-05, + "loss": 0.1852, + "step": 15545 + }, + { + "epoch": 0.248736, + "grad_norm": 0.7109375, + "learning_rate": 7.573387096774194e-05, + "loss": 0.1748, + "step": 15546 + }, + { + "epoch": 0.248752, + "grad_norm": 0.765625, + "learning_rate": 7.573225806451614e-05, + "loss": 0.1618, + "step": 15547 + }, + { + "epoch": 0.248768, + "grad_norm": 1.078125, + "learning_rate": 7.573064516129033e-05, + "loss": 0.1611, + "step": 15548 + }, + { + "epoch": 0.248784, + "grad_norm": 0.6796875, + "learning_rate": 7.572903225806453e-05, + "loss": 0.1745, + "step": 15549 + }, + { + "epoch": 0.2488, + "grad_norm": 0.9765625, + "learning_rate": 7.572741935483871e-05, + "loss": 0.1285, + "step": 15550 + }, + { + "epoch": 0.248816, + "grad_norm": 1.234375, + "learning_rate": 7.57258064516129e-05, + "loss": 0.1884, + "step": 15551 + }, + { + "epoch": 0.248832, + "grad_norm": 0.84765625, + "learning_rate": 7.57241935483871e-05, + "loss": 0.1887, + "step": 15552 + }, + { + "epoch": 0.248848, + "grad_norm": 1.125, + "learning_rate": 7.572258064516128e-05, + "loss": 0.1588, + "step": 15553 + }, + { + "epoch": 0.248864, + "grad_norm": 0.76953125, + "learning_rate": 7.572096774193548e-05, + "loss": 0.1721, + "step": 15554 + }, + { + "epoch": 0.24888, + "grad_norm": 0.78125, + "learning_rate": 7.571935483870968e-05, + "loss": 0.1859, + "step": 15555 + }, + { + "epoch": 0.248896, + "grad_norm": 0.76171875, + "learning_rate": 7.571774193548388e-05, + "loss": 0.1585, + "step": 15556 + }, + { + "epoch": 0.248912, + "grad_norm": 0.81640625, + "learning_rate": 7.571612903225807e-05, + "loss": 0.154, + "step": 15557 + }, + { + "epoch": 0.248928, + "grad_norm": 0.71875, + "learning_rate": 7.571451612903227e-05, + "loss": 0.2226, + "step": 15558 + }, + { + "epoch": 0.248944, + "grad_norm": 0.80078125, + "learning_rate": 7.571290322580645e-05, + "loss": 0.1852, + "step": 15559 + }, + { + "epoch": 0.24896, + "grad_norm": 0.84765625, + "learning_rate": 7.571129032258065e-05, + "loss": 0.1886, + "step": 15560 + }, + { + "epoch": 0.248976, + "grad_norm": 0.8125, + "learning_rate": 7.570967741935484e-05, + "loss": 0.1825, + "step": 15561 + }, + { + "epoch": 0.248992, + "grad_norm": 0.78515625, + "learning_rate": 7.570806451612904e-05, + "loss": 0.1721, + "step": 15562 + }, + { + "epoch": 0.249008, + "grad_norm": 1.3359375, + "learning_rate": 7.570645161290323e-05, + "loss": 0.1929, + "step": 15563 + }, + { + "epoch": 0.249024, + "grad_norm": 0.5546875, + "learning_rate": 7.570483870967743e-05, + "loss": 0.1677, + "step": 15564 + }, + { + "epoch": 0.24904, + "grad_norm": 0.69140625, + "learning_rate": 7.570322580645161e-05, + "loss": 0.1807, + "step": 15565 + }, + { + "epoch": 0.249056, + "grad_norm": 0.55078125, + "learning_rate": 7.570161290322581e-05, + "loss": 0.1805, + "step": 15566 + }, + { + "epoch": 0.249072, + "grad_norm": 1.109375, + "learning_rate": 7.570000000000001e-05, + "loss": 0.1651, + "step": 15567 + }, + { + "epoch": 0.249088, + "grad_norm": 0.55859375, + "learning_rate": 7.56983870967742e-05, + "loss": 0.1375, + "step": 15568 + }, + { + "epoch": 0.249104, + "grad_norm": 0.61328125, + "learning_rate": 7.56967741935484e-05, + "loss": 0.1683, + "step": 15569 + }, + { + "epoch": 0.24912, + "grad_norm": 0.97265625, + "learning_rate": 7.569516129032258e-05, + "loss": 0.1696, + "step": 15570 + }, + { + "epoch": 0.249136, + "grad_norm": 0.921875, + "learning_rate": 7.569354838709678e-05, + "loss": 0.1514, + "step": 15571 + }, + { + "epoch": 0.249152, + "grad_norm": 1.2890625, + "learning_rate": 7.569193548387097e-05, + "loss": 0.189, + "step": 15572 + }, + { + "epoch": 0.249168, + "grad_norm": 0.89453125, + "learning_rate": 7.569032258064517e-05, + "loss": 0.197, + "step": 15573 + }, + { + "epoch": 0.249184, + "grad_norm": 0.9453125, + "learning_rate": 7.568870967741935e-05, + "loss": 0.147, + "step": 15574 + }, + { + "epoch": 0.2492, + "grad_norm": 0.65234375, + "learning_rate": 7.568709677419355e-05, + "loss": 0.1537, + "step": 15575 + }, + { + "epoch": 0.249216, + "grad_norm": 0.68359375, + "learning_rate": 7.568548387096774e-05, + "loss": 0.184, + "step": 15576 + }, + { + "epoch": 0.249232, + "grad_norm": 0.85546875, + "learning_rate": 7.568387096774194e-05, + "loss": 0.1417, + "step": 15577 + }, + { + "epoch": 0.249248, + "grad_norm": 0.89453125, + "learning_rate": 7.568225806451613e-05, + "loss": 0.1261, + "step": 15578 + }, + { + "epoch": 0.249264, + "grad_norm": 0.8046875, + "learning_rate": 7.568064516129032e-05, + "loss": 0.1758, + "step": 15579 + }, + { + "epoch": 0.24928, + "grad_norm": 0.76171875, + "learning_rate": 7.567903225806452e-05, + "loss": 0.1803, + "step": 15580 + }, + { + "epoch": 0.249296, + "grad_norm": 0.8359375, + "learning_rate": 7.567741935483871e-05, + "loss": 0.1668, + "step": 15581 + }, + { + "epoch": 0.249312, + "grad_norm": 1.0234375, + "learning_rate": 7.567580645161291e-05, + "loss": 0.116, + "step": 15582 + }, + { + "epoch": 0.249328, + "grad_norm": 0.7578125, + "learning_rate": 7.56741935483871e-05, + "loss": 0.1531, + "step": 15583 + }, + { + "epoch": 0.249344, + "grad_norm": 1.0859375, + "learning_rate": 7.56725806451613e-05, + "loss": 0.1888, + "step": 15584 + }, + { + "epoch": 0.24936, + "grad_norm": 0.875, + "learning_rate": 7.567096774193548e-05, + "loss": 0.1759, + "step": 15585 + }, + { + "epoch": 0.249376, + "grad_norm": 0.75, + "learning_rate": 7.566935483870968e-05, + "loss": 0.1679, + "step": 15586 + }, + { + "epoch": 0.249392, + "grad_norm": 1.0546875, + "learning_rate": 7.566774193548387e-05, + "loss": 0.1895, + "step": 15587 + }, + { + "epoch": 0.249408, + "grad_norm": 0.7421875, + "learning_rate": 7.566612903225807e-05, + "loss": 0.1902, + "step": 15588 + }, + { + "epoch": 0.249424, + "grad_norm": 0.734375, + "learning_rate": 7.566451612903225e-05, + "loss": 0.1671, + "step": 15589 + }, + { + "epoch": 0.24944, + "grad_norm": 0.75, + "learning_rate": 7.566290322580645e-05, + "loss": 0.149, + "step": 15590 + }, + { + "epoch": 0.249456, + "grad_norm": 0.734375, + "learning_rate": 7.566129032258065e-05, + "loss": 0.1674, + "step": 15591 + }, + { + "epoch": 0.249472, + "grad_norm": 0.83984375, + "learning_rate": 7.565967741935485e-05, + "loss": 0.154, + "step": 15592 + }, + { + "epoch": 0.249488, + "grad_norm": 1.125, + "learning_rate": 7.565806451612904e-05, + "loss": 0.1649, + "step": 15593 + }, + { + "epoch": 0.249504, + "grad_norm": 0.6796875, + "learning_rate": 7.565645161290324e-05, + "loss": 0.1179, + "step": 15594 + }, + { + "epoch": 0.24952, + "grad_norm": 0.69140625, + "learning_rate": 7.565483870967742e-05, + "loss": 0.1523, + "step": 15595 + }, + { + "epoch": 0.249536, + "grad_norm": 0.6875, + "learning_rate": 7.565322580645162e-05, + "loss": 0.1673, + "step": 15596 + }, + { + "epoch": 0.249552, + "grad_norm": 0.77734375, + "learning_rate": 7.565161290322581e-05, + "loss": 0.1564, + "step": 15597 + }, + { + "epoch": 0.249568, + "grad_norm": 0.76171875, + "learning_rate": 7.565e-05, + "loss": 0.1296, + "step": 15598 + }, + { + "epoch": 0.249584, + "grad_norm": 0.98046875, + "learning_rate": 7.56483870967742e-05, + "loss": 0.1665, + "step": 15599 + }, + { + "epoch": 0.2496, + "grad_norm": 0.60546875, + "learning_rate": 7.564677419354838e-05, + "loss": 0.1268, + "step": 15600 + }, + { + "epoch": 0.249616, + "grad_norm": 0.6484375, + "learning_rate": 7.564516129032258e-05, + "loss": 0.1843, + "step": 15601 + }, + { + "epoch": 0.249632, + "grad_norm": 0.77734375, + "learning_rate": 7.564354838709678e-05, + "loss": 0.1679, + "step": 15602 + }, + { + "epoch": 0.249648, + "grad_norm": 0.8671875, + "learning_rate": 7.564193548387098e-05, + "loss": 0.1802, + "step": 15603 + }, + { + "epoch": 0.249664, + "grad_norm": 0.66015625, + "learning_rate": 7.564032258064517e-05, + "loss": 0.1574, + "step": 15604 + }, + { + "epoch": 0.24968, + "grad_norm": 1.4140625, + "learning_rate": 7.563870967741937e-05, + "loss": 0.1597, + "step": 15605 + }, + { + "epoch": 0.249696, + "grad_norm": 1.34375, + "learning_rate": 7.563709677419355e-05, + "loss": 0.1832, + "step": 15606 + }, + { + "epoch": 0.249712, + "grad_norm": 0.92578125, + "learning_rate": 7.563548387096775e-05, + "loss": 0.215, + "step": 15607 + }, + { + "epoch": 0.249728, + "grad_norm": 0.96875, + "learning_rate": 7.563387096774194e-05, + "loss": 0.172, + "step": 15608 + }, + { + "epoch": 0.249744, + "grad_norm": 0.65625, + "learning_rate": 7.563225806451614e-05, + "loss": 0.2041, + "step": 15609 + }, + { + "epoch": 0.24976, + "grad_norm": 1.0625, + "learning_rate": 7.563064516129032e-05, + "loss": 0.1507, + "step": 15610 + }, + { + "epoch": 0.249776, + "grad_norm": 0.8828125, + "learning_rate": 7.562903225806452e-05, + "loss": 0.145, + "step": 15611 + }, + { + "epoch": 0.249792, + "grad_norm": 0.83984375, + "learning_rate": 7.562741935483871e-05, + "loss": 0.1365, + "step": 15612 + }, + { + "epoch": 0.249808, + "grad_norm": 0.62890625, + "learning_rate": 7.56258064516129e-05, + "loss": 0.1374, + "step": 15613 + }, + { + "epoch": 0.249824, + "grad_norm": 0.9765625, + "learning_rate": 7.56241935483871e-05, + "loss": 0.2205, + "step": 15614 + }, + { + "epoch": 0.24984, + "grad_norm": 0.90234375, + "learning_rate": 7.56225806451613e-05, + "loss": 0.2392, + "step": 15615 + }, + { + "epoch": 0.249856, + "grad_norm": 0.85546875, + "learning_rate": 7.56209677419355e-05, + "loss": 0.2159, + "step": 15616 + }, + { + "epoch": 0.249872, + "grad_norm": 0.87890625, + "learning_rate": 7.561935483870968e-05, + "loss": 0.1985, + "step": 15617 + }, + { + "epoch": 0.249888, + "grad_norm": 0.64453125, + "learning_rate": 7.561774193548388e-05, + "loss": 0.1584, + "step": 15618 + }, + { + "epoch": 0.249904, + "grad_norm": 0.73046875, + "learning_rate": 7.561612903225807e-05, + "loss": 0.171, + "step": 15619 + }, + { + "epoch": 0.24992, + "grad_norm": 0.62109375, + "learning_rate": 7.561451612903227e-05, + "loss": 0.1501, + "step": 15620 + }, + { + "epoch": 0.249936, + "grad_norm": 1.1875, + "learning_rate": 7.561290322580645e-05, + "loss": 0.1595, + "step": 15621 + }, + { + "epoch": 0.249952, + "grad_norm": 1.03125, + "learning_rate": 7.561129032258065e-05, + "loss": 0.1518, + "step": 15622 + }, + { + "epoch": 0.249968, + "grad_norm": 1.09375, + "learning_rate": 7.560967741935484e-05, + "loss": 0.1802, + "step": 15623 + }, + { + "epoch": 0.249984, + "grad_norm": 0.83203125, + "learning_rate": 7.560806451612904e-05, + "loss": 0.1513, + "step": 15624 + }, + { + "epoch": 0.25, + "grad_norm": 0.73046875, + "learning_rate": 7.560645161290322e-05, + "loss": 0.1828, + "step": 15625 + }, + { + "epoch": 0.250016, + "grad_norm": 1.109375, + "learning_rate": 7.560483870967742e-05, + "loss": 0.1675, + "step": 15626 + }, + { + "epoch": 0.250032, + "grad_norm": 0.73828125, + "learning_rate": 7.560322580645162e-05, + "loss": 0.1686, + "step": 15627 + }, + { + "epoch": 0.250048, + "grad_norm": 1.734375, + "learning_rate": 7.560161290322581e-05, + "loss": 0.155, + "step": 15628 + }, + { + "epoch": 0.250064, + "grad_norm": 0.71875, + "learning_rate": 7.560000000000001e-05, + "loss": 0.1438, + "step": 15629 + }, + { + "epoch": 0.25008, + "grad_norm": 0.88671875, + "learning_rate": 7.55983870967742e-05, + "loss": 0.1918, + "step": 15630 + }, + { + "epoch": 0.250096, + "grad_norm": 1.0234375, + "learning_rate": 7.55967741935484e-05, + "loss": 0.1796, + "step": 15631 + }, + { + "epoch": 0.250112, + "grad_norm": 1.1796875, + "learning_rate": 7.559516129032258e-05, + "loss": 0.1921, + "step": 15632 + }, + { + "epoch": 0.250128, + "grad_norm": 0.9140625, + "learning_rate": 7.559354838709678e-05, + "loss": 0.1744, + "step": 15633 + }, + { + "epoch": 0.250144, + "grad_norm": 0.6015625, + "learning_rate": 7.559193548387097e-05, + "loss": 0.1762, + "step": 15634 + }, + { + "epoch": 0.25016, + "grad_norm": 1.4609375, + "learning_rate": 7.559032258064517e-05, + "loss": 0.1763, + "step": 15635 + }, + { + "epoch": 0.250176, + "grad_norm": 0.8984375, + "learning_rate": 7.558870967741935e-05, + "loss": 0.1712, + "step": 15636 + }, + { + "epoch": 0.250192, + "grad_norm": 1.109375, + "learning_rate": 7.558709677419355e-05, + "loss": 0.2091, + "step": 15637 + }, + { + "epoch": 0.250208, + "grad_norm": 0.72265625, + "learning_rate": 7.558548387096775e-05, + "loss": 0.1788, + "step": 15638 + }, + { + "epoch": 0.250224, + "grad_norm": 1.0234375, + "learning_rate": 7.558387096774194e-05, + "loss": 0.1627, + "step": 15639 + }, + { + "epoch": 0.25024, + "grad_norm": 0.82421875, + "learning_rate": 7.558225806451614e-05, + "loss": 0.1875, + "step": 15640 + }, + { + "epoch": 0.250256, + "grad_norm": 0.7421875, + "learning_rate": 7.558064516129034e-05, + "loss": 0.1577, + "step": 15641 + }, + { + "epoch": 0.250272, + "grad_norm": 1.3515625, + "learning_rate": 7.557903225806452e-05, + "loss": 0.194, + "step": 15642 + }, + { + "epoch": 0.250288, + "grad_norm": 1.0625, + "learning_rate": 7.557741935483872e-05, + "loss": 0.1932, + "step": 15643 + }, + { + "epoch": 0.250304, + "grad_norm": 0.609375, + "learning_rate": 7.557580645161291e-05, + "loss": 0.1163, + "step": 15644 + }, + { + "epoch": 0.25032, + "grad_norm": 0.734375, + "learning_rate": 7.55741935483871e-05, + "loss": 0.1579, + "step": 15645 + }, + { + "epoch": 0.250336, + "grad_norm": 0.875, + "learning_rate": 7.55725806451613e-05, + "loss": 0.1827, + "step": 15646 + }, + { + "epoch": 0.250352, + "grad_norm": 0.67578125, + "learning_rate": 7.557096774193548e-05, + "loss": 0.1617, + "step": 15647 + }, + { + "epoch": 0.250368, + "grad_norm": 0.73828125, + "learning_rate": 7.556935483870968e-05, + "loss": 0.1751, + "step": 15648 + }, + { + "epoch": 0.250384, + "grad_norm": 0.50390625, + "learning_rate": 7.556774193548387e-05, + "loss": 0.1494, + "step": 15649 + }, + { + "epoch": 0.2504, + "grad_norm": 1.3671875, + "learning_rate": 7.556612903225806e-05, + "loss": 0.1958, + "step": 15650 + }, + { + "epoch": 0.250416, + "grad_norm": 0.73046875, + "learning_rate": 7.556451612903226e-05, + "loss": 0.1694, + "step": 15651 + }, + { + "epoch": 0.250432, + "grad_norm": 0.94140625, + "learning_rate": 7.556290322580646e-05, + "loss": 0.2127, + "step": 15652 + }, + { + "epoch": 0.250448, + "grad_norm": 1.0390625, + "learning_rate": 7.556129032258065e-05, + "loss": 0.1858, + "step": 15653 + }, + { + "epoch": 0.250464, + "grad_norm": 0.5625, + "learning_rate": 7.555967741935485e-05, + "loss": 0.1313, + "step": 15654 + }, + { + "epoch": 0.25048, + "grad_norm": 1.390625, + "learning_rate": 7.555806451612904e-05, + "loss": 0.2087, + "step": 15655 + }, + { + "epoch": 0.250496, + "grad_norm": 0.640625, + "learning_rate": 7.555645161290324e-05, + "loss": 0.1369, + "step": 15656 + }, + { + "epoch": 0.250512, + "grad_norm": 1.3828125, + "learning_rate": 7.555483870967742e-05, + "loss": 0.2005, + "step": 15657 + }, + { + "epoch": 0.250528, + "grad_norm": 0.70703125, + "learning_rate": 7.555322580645162e-05, + "loss": 0.1616, + "step": 15658 + }, + { + "epoch": 0.250544, + "grad_norm": 0.7578125, + "learning_rate": 7.555161290322581e-05, + "loss": 0.1454, + "step": 15659 + }, + { + "epoch": 0.25056, + "grad_norm": 0.98046875, + "learning_rate": 7.555e-05, + "loss": 0.2002, + "step": 15660 + }, + { + "epoch": 0.250576, + "grad_norm": 0.79296875, + "learning_rate": 7.554838709677419e-05, + "loss": 0.1614, + "step": 15661 + }, + { + "epoch": 0.250592, + "grad_norm": 1.484375, + "learning_rate": 7.554677419354839e-05, + "loss": 0.2334, + "step": 15662 + }, + { + "epoch": 0.250608, + "grad_norm": 1.296875, + "learning_rate": 7.554516129032259e-05, + "loss": 0.1956, + "step": 15663 + }, + { + "epoch": 0.250624, + "grad_norm": 0.890625, + "learning_rate": 7.554354838709678e-05, + "loss": 0.1861, + "step": 15664 + }, + { + "epoch": 0.25064, + "grad_norm": 0.75, + "learning_rate": 7.554193548387098e-05, + "loss": 0.17, + "step": 15665 + }, + { + "epoch": 0.250656, + "grad_norm": 0.71484375, + "learning_rate": 7.554032258064516e-05, + "loss": 0.1781, + "step": 15666 + }, + { + "epoch": 0.250672, + "grad_norm": 0.5703125, + "learning_rate": 7.553870967741936e-05, + "loss": 0.1557, + "step": 15667 + }, + { + "epoch": 0.250688, + "grad_norm": 0.71484375, + "learning_rate": 7.553709677419355e-05, + "loss": 0.1601, + "step": 15668 + }, + { + "epoch": 0.250704, + "grad_norm": 0.67578125, + "learning_rate": 7.553548387096775e-05, + "loss": 0.1975, + "step": 15669 + }, + { + "epoch": 0.25072, + "grad_norm": 0.9609375, + "learning_rate": 7.553387096774194e-05, + "loss": 0.1534, + "step": 15670 + }, + { + "epoch": 0.250736, + "grad_norm": 1.0625, + "learning_rate": 7.553225806451613e-05, + "loss": 0.178, + "step": 15671 + }, + { + "epoch": 0.250752, + "grad_norm": 1.0703125, + "learning_rate": 7.553064516129032e-05, + "loss": 0.2269, + "step": 15672 + }, + { + "epoch": 0.250768, + "grad_norm": 0.80078125, + "learning_rate": 7.552903225806452e-05, + "loss": 0.1432, + "step": 15673 + }, + { + "epoch": 0.250784, + "grad_norm": 0.87890625, + "learning_rate": 7.55274193548387e-05, + "loss": 0.1981, + "step": 15674 + }, + { + "epoch": 0.2508, + "grad_norm": 0.64453125, + "learning_rate": 7.55258064516129e-05, + "loss": 0.17, + "step": 15675 + }, + { + "epoch": 0.250816, + "grad_norm": 1.046875, + "learning_rate": 7.55241935483871e-05, + "loss": 0.1931, + "step": 15676 + }, + { + "epoch": 0.250832, + "grad_norm": 0.73046875, + "learning_rate": 7.552258064516129e-05, + "loss": 0.1736, + "step": 15677 + }, + { + "epoch": 0.250848, + "grad_norm": 0.6953125, + "learning_rate": 7.552096774193549e-05, + "loss": 0.2116, + "step": 15678 + }, + { + "epoch": 0.250864, + "grad_norm": 0.76953125, + "learning_rate": 7.551935483870968e-05, + "loss": 0.1374, + "step": 15679 + }, + { + "epoch": 0.25088, + "grad_norm": 0.54296875, + "learning_rate": 7.551774193548388e-05, + "loss": 0.1537, + "step": 15680 + }, + { + "epoch": 0.250896, + "grad_norm": 0.734375, + "learning_rate": 7.551612903225806e-05, + "loss": 0.1553, + "step": 15681 + }, + { + "epoch": 0.250912, + "grad_norm": 0.859375, + "learning_rate": 7.551451612903226e-05, + "loss": 0.1579, + "step": 15682 + }, + { + "epoch": 0.250928, + "grad_norm": 0.95703125, + "learning_rate": 7.551290322580645e-05, + "loss": 0.1722, + "step": 15683 + }, + { + "epoch": 0.250944, + "grad_norm": 0.67578125, + "learning_rate": 7.551129032258065e-05, + "loss": 0.1626, + "step": 15684 + }, + { + "epoch": 0.25096, + "grad_norm": 0.60546875, + "learning_rate": 7.550967741935483e-05, + "loss": 0.2037, + "step": 15685 + }, + { + "epoch": 0.250976, + "grad_norm": 0.6796875, + "learning_rate": 7.550806451612903e-05, + "loss": 0.2111, + "step": 15686 + }, + { + "epoch": 0.250992, + "grad_norm": 0.7578125, + "learning_rate": 7.550645161290323e-05, + "loss": 0.1635, + "step": 15687 + }, + { + "epoch": 0.251008, + "grad_norm": 0.703125, + "learning_rate": 7.550483870967743e-05, + "loss": 0.174, + "step": 15688 + }, + { + "epoch": 0.251024, + "grad_norm": 0.7421875, + "learning_rate": 7.550322580645162e-05, + "loss": 0.2155, + "step": 15689 + }, + { + "epoch": 0.25104, + "grad_norm": 0.49609375, + "learning_rate": 7.55016129032258e-05, + "loss": 0.1711, + "step": 15690 + }, + { + "epoch": 0.251056, + "grad_norm": 0.5859375, + "learning_rate": 7.55e-05, + "loss": 0.1764, + "step": 15691 + }, + { + "epoch": 0.251072, + "grad_norm": 0.6171875, + "learning_rate": 7.549838709677419e-05, + "loss": 0.1389, + "step": 15692 + }, + { + "epoch": 0.251088, + "grad_norm": 0.5, + "learning_rate": 7.549677419354839e-05, + "loss": 0.1462, + "step": 15693 + }, + { + "epoch": 0.251104, + "grad_norm": 0.67578125, + "learning_rate": 7.549516129032258e-05, + "loss": 0.1673, + "step": 15694 + }, + { + "epoch": 0.25112, + "grad_norm": 0.80078125, + "learning_rate": 7.549354838709678e-05, + "loss": 0.1744, + "step": 15695 + }, + { + "epoch": 0.251136, + "grad_norm": 0.85546875, + "learning_rate": 7.549193548387096e-05, + "loss": 0.1747, + "step": 15696 + }, + { + "epoch": 0.251152, + "grad_norm": 0.625, + "learning_rate": 7.549032258064516e-05, + "loss": 0.1197, + "step": 15697 + }, + { + "epoch": 0.251168, + "grad_norm": 0.83203125, + "learning_rate": 7.548870967741936e-05, + "loss": 0.1459, + "step": 15698 + }, + { + "epoch": 0.251184, + "grad_norm": 0.9296875, + "learning_rate": 7.548709677419356e-05, + "loss": 0.2256, + "step": 15699 + }, + { + "epoch": 0.2512, + "grad_norm": 0.53515625, + "learning_rate": 7.548548387096775e-05, + "loss": 0.1422, + "step": 15700 + }, + { + "epoch": 0.251216, + "grad_norm": 0.58203125, + "learning_rate": 7.548387096774195e-05, + "loss": 0.1641, + "step": 15701 + }, + { + "epoch": 0.251232, + "grad_norm": 0.75390625, + "learning_rate": 7.548225806451613e-05, + "loss": 0.1708, + "step": 15702 + }, + { + "epoch": 0.251248, + "grad_norm": 0.71875, + "learning_rate": 7.548064516129033e-05, + "loss": 0.1465, + "step": 15703 + }, + { + "epoch": 0.251264, + "grad_norm": 0.703125, + "learning_rate": 7.547903225806452e-05, + "loss": 0.1413, + "step": 15704 + }, + { + "epoch": 0.25128, + "grad_norm": 0.90625, + "learning_rate": 7.547741935483872e-05, + "loss": 0.1715, + "step": 15705 + }, + { + "epoch": 0.251296, + "grad_norm": 0.63671875, + "learning_rate": 7.54758064516129e-05, + "loss": 0.1787, + "step": 15706 + }, + { + "epoch": 0.251312, + "grad_norm": 1.0234375, + "learning_rate": 7.547419354838709e-05, + "loss": 0.2126, + "step": 15707 + }, + { + "epoch": 0.251328, + "grad_norm": 0.765625, + "learning_rate": 7.547258064516129e-05, + "loss": 0.1597, + "step": 15708 + }, + { + "epoch": 0.251344, + "grad_norm": 0.5, + "learning_rate": 7.547096774193548e-05, + "loss": 0.1826, + "step": 15709 + }, + { + "epoch": 0.25136, + "grad_norm": 1.078125, + "learning_rate": 7.546935483870968e-05, + "loss": 0.2261, + "step": 15710 + }, + { + "epoch": 0.251376, + "grad_norm": 1.2578125, + "learning_rate": 7.546774193548388e-05, + "loss": 0.2152, + "step": 15711 + }, + { + "epoch": 0.251392, + "grad_norm": 0.9453125, + "learning_rate": 7.546612903225808e-05, + "loss": 0.1756, + "step": 15712 + }, + { + "epoch": 0.251408, + "grad_norm": 1.109375, + "learning_rate": 7.546451612903226e-05, + "loss": 0.1751, + "step": 15713 + }, + { + "epoch": 0.251424, + "grad_norm": 0.98046875, + "learning_rate": 7.546290322580646e-05, + "loss": 0.2301, + "step": 15714 + }, + { + "epoch": 0.25144, + "grad_norm": 1.015625, + "learning_rate": 7.546129032258065e-05, + "loss": 0.154, + "step": 15715 + }, + { + "epoch": 0.251456, + "grad_norm": 0.65625, + "learning_rate": 7.545967741935485e-05, + "loss": 0.1355, + "step": 15716 + }, + { + "epoch": 0.251472, + "grad_norm": 0.55078125, + "learning_rate": 7.545806451612903e-05, + "loss": 0.1659, + "step": 15717 + }, + { + "epoch": 0.251488, + "grad_norm": 0.58984375, + "learning_rate": 7.545645161290323e-05, + "loss": 0.1419, + "step": 15718 + }, + { + "epoch": 0.251504, + "grad_norm": 0.73046875, + "learning_rate": 7.545483870967742e-05, + "loss": 0.1796, + "step": 15719 + }, + { + "epoch": 0.25152, + "grad_norm": 0.921875, + "learning_rate": 7.545322580645162e-05, + "loss": 0.1382, + "step": 15720 + }, + { + "epoch": 0.251536, + "grad_norm": 0.7578125, + "learning_rate": 7.54516129032258e-05, + "loss": 0.1898, + "step": 15721 + }, + { + "epoch": 0.251552, + "grad_norm": 0.490234375, + "learning_rate": 7.545e-05, + "loss": 0.1384, + "step": 15722 + }, + { + "epoch": 0.251568, + "grad_norm": 0.7734375, + "learning_rate": 7.54483870967742e-05, + "loss": 0.2073, + "step": 15723 + }, + { + "epoch": 0.251584, + "grad_norm": 0.71875, + "learning_rate": 7.544677419354839e-05, + "loss": 0.1973, + "step": 15724 + }, + { + "epoch": 0.2516, + "grad_norm": 0.91015625, + "learning_rate": 7.544516129032259e-05, + "loss": 0.1832, + "step": 15725 + }, + { + "epoch": 0.251616, + "grad_norm": 0.6875, + "learning_rate": 7.544354838709678e-05, + "loss": 0.1652, + "step": 15726 + }, + { + "epoch": 0.251632, + "grad_norm": 0.71875, + "learning_rate": 7.544193548387098e-05, + "loss": 0.1929, + "step": 15727 + }, + { + "epoch": 0.251648, + "grad_norm": 0.95703125, + "learning_rate": 7.544032258064516e-05, + "loss": 0.1723, + "step": 15728 + }, + { + "epoch": 0.251664, + "grad_norm": 0.6640625, + "learning_rate": 7.543870967741936e-05, + "loss": 0.1418, + "step": 15729 + }, + { + "epoch": 0.25168, + "grad_norm": 0.63671875, + "learning_rate": 7.543709677419355e-05, + "loss": 0.1463, + "step": 15730 + }, + { + "epoch": 0.251696, + "grad_norm": 0.67578125, + "learning_rate": 7.543548387096775e-05, + "loss": 0.162, + "step": 15731 + }, + { + "epoch": 0.251712, + "grad_norm": 0.5703125, + "learning_rate": 7.543387096774193e-05, + "loss": 0.1524, + "step": 15732 + }, + { + "epoch": 0.251728, + "grad_norm": 0.8984375, + "learning_rate": 7.543225806451613e-05, + "loss": 0.1961, + "step": 15733 + }, + { + "epoch": 0.251744, + "grad_norm": 0.58203125, + "learning_rate": 7.543064516129032e-05, + "loss": 0.1562, + "step": 15734 + }, + { + "epoch": 0.25176, + "grad_norm": 0.734375, + "learning_rate": 7.542903225806452e-05, + "loss": 0.182, + "step": 15735 + }, + { + "epoch": 0.251776, + "grad_norm": 0.78125, + "learning_rate": 7.542741935483872e-05, + "loss": 0.1762, + "step": 15736 + }, + { + "epoch": 0.251792, + "grad_norm": 1.15625, + "learning_rate": 7.54258064516129e-05, + "loss": 0.1986, + "step": 15737 + }, + { + "epoch": 0.251808, + "grad_norm": 0.59765625, + "learning_rate": 7.54241935483871e-05, + "loss": 0.1584, + "step": 15738 + }, + { + "epoch": 0.251824, + "grad_norm": 0.8359375, + "learning_rate": 7.542258064516129e-05, + "loss": 0.1658, + "step": 15739 + }, + { + "epoch": 0.25184, + "grad_norm": 0.8515625, + "learning_rate": 7.542096774193549e-05, + "loss": 0.1773, + "step": 15740 + }, + { + "epoch": 0.251856, + "grad_norm": 0.9765625, + "learning_rate": 7.541935483870968e-05, + "loss": 0.1331, + "step": 15741 + }, + { + "epoch": 0.251872, + "grad_norm": 0.80859375, + "learning_rate": 7.541774193548387e-05, + "loss": 0.145, + "step": 15742 + }, + { + "epoch": 0.251888, + "grad_norm": 0.75390625, + "learning_rate": 7.541612903225806e-05, + "loss": 0.1667, + "step": 15743 + }, + { + "epoch": 0.251904, + "grad_norm": 0.8984375, + "learning_rate": 7.541451612903226e-05, + "loss": 0.1771, + "step": 15744 + }, + { + "epoch": 0.25192, + "grad_norm": 1.171875, + "learning_rate": 7.541290322580645e-05, + "loss": 0.1889, + "step": 15745 + }, + { + "epoch": 0.251936, + "grad_norm": 1.0546875, + "learning_rate": 7.541129032258065e-05, + "loss": 0.1428, + "step": 15746 + }, + { + "epoch": 0.251952, + "grad_norm": 0.828125, + "learning_rate": 7.540967741935485e-05, + "loss": 0.1382, + "step": 15747 + }, + { + "epoch": 0.251968, + "grad_norm": 0.5, + "learning_rate": 7.540806451612905e-05, + "loss": 0.1146, + "step": 15748 + }, + { + "epoch": 0.251984, + "grad_norm": 0.671875, + "learning_rate": 7.540645161290323e-05, + "loss": 0.1267, + "step": 15749 + }, + { + "epoch": 0.252, + "grad_norm": 0.65234375, + "learning_rate": 7.540483870967743e-05, + "loss": 0.1545, + "step": 15750 + }, + { + "epoch": 0.252016, + "grad_norm": 0.578125, + "learning_rate": 7.540322580645162e-05, + "loss": 0.1653, + "step": 15751 + }, + { + "epoch": 0.252032, + "grad_norm": 0.59765625, + "learning_rate": 7.540161290322582e-05, + "loss": 0.1449, + "step": 15752 + }, + { + "epoch": 0.252048, + "grad_norm": 1.2109375, + "learning_rate": 7.54e-05, + "loss": 0.2202, + "step": 15753 + }, + { + "epoch": 0.252064, + "grad_norm": 0.703125, + "learning_rate": 7.539838709677419e-05, + "loss": 0.1452, + "step": 15754 + }, + { + "epoch": 0.25208, + "grad_norm": 0.76953125, + "learning_rate": 7.539677419354839e-05, + "loss": 0.1922, + "step": 15755 + }, + { + "epoch": 0.252096, + "grad_norm": 0.90234375, + "learning_rate": 7.539516129032257e-05, + "loss": 0.2128, + "step": 15756 + }, + { + "epoch": 0.252112, + "grad_norm": 0.6484375, + "learning_rate": 7.539354838709677e-05, + "loss": 0.1419, + "step": 15757 + }, + { + "epoch": 0.252128, + "grad_norm": 0.859375, + "learning_rate": 7.539193548387097e-05, + "loss": 0.1446, + "step": 15758 + }, + { + "epoch": 0.252144, + "grad_norm": 0.84375, + "learning_rate": 7.539032258064517e-05, + "loss": 0.1732, + "step": 15759 + }, + { + "epoch": 0.25216, + "grad_norm": 0.828125, + "learning_rate": 7.538870967741936e-05, + "loss": 0.192, + "step": 15760 + }, + { + "epoch": 0.252176, + "grad_norm": 0.640625, + "learning_rate": 7.538709677419356e-05, + "loss": 0.1529, + "step": 15761 + }, + { + "epoch": 0.252192, + "grad_norm": 0.90234375, + "learning_rate": 7.538548387096775e-05, + "loss": 0.1804, + "step": 15762 + }, + { + "epoch": 0.252208, + "grad_norm": 1.0390625, + "learning_rate": 7.538387096774195e-05, + "loss": 0.1677, + "step": 15763 + }, + { + "epoch": 0.252224, + "grad_norm": 0.89453125, + "learning_rate": 7.538225806451613e-05, + "loss": 0.1357, + "step": 15764 + }, + { + "epoch": 0.25224, + "grad_norm": 0.609375, + "learning_rate": 7.538064516129033e-05, + "loss": 0.1753, + "step": 15765 + }, + { + "epoch": 0.252256, + "grad_norm": 0.56640625, + "learning_rate": 7.537903225806452e-05, + "loss": 0.163, + "step": 15766 + }, + { + "epoch": 0.252272, + "grad_norm": 0.84765625, + "learning_rate": 7.537741935483872e-05, + "loss": 0.1999, + "step": 15767 + }, + { + "epoch": 0.252288, + "grad_norm": 0.8828125, + "learning_rate": 7.53758064516129e-05, + "loss": 0.1704, + "step": 15768 + }, + { + "epoch": 0.252304, + "grad_norm": 1.0859375, + "learning_rate": 7.537419354838709e-05, + "loss": 0.1916, + "step": 15769 + }, + { + "epoch": 0.25232, + "grad_norm": 1.0625, + "learning_rate": 7.537258064516129e-05, + "loss": 0.1401, + "step": 15770 + }, + { + "epoch": 0.252336, + "grad_norm": 0.57421875, + "learning_rate": 7.537096774193549e-05, + "loss": 0.1441, + "step": 15771 + }, + { + "epoch": 0.252352, + "grad_norm": 1.328125, + "learning_rate": 7.536935483870969e-05, + "loss": 0.2108, + "step": 15772 + }, + { + "epoch": 0.252368, + "grad_norm": 0.62890625, + "learning_rate": 7.536774193548387e-05, + "loss": 0.1862, + "step": 15773 + }, + { + "epoch": 0.252384, + "grad_norm": 1.25, + "learning_rate": 7.536612903225807e-05, + "loss": 0.1946, + "step": 15774 + }, + { + "epoch": 0.2524, + "grad_norm": 0.71875, + "learning_rate": 7.536451612903226e-05, + "loss": 0.181, + "step": 15775 + }, + { + "epoch": 0.252416, + "grad_norm": 0.68359375, + "learning_rate": 7.536290322580646e-05, + "loss": 0.1253, + "step": 15776 + }, + { + "epoch": 0.252432, + "grad_norm": 0.671875, + "learning_rate": 7.536129032258065e-05, + "loss": 0.1799, + "step": 15777 + }, + { + "epoch": 0.252448, + "grad_norm": 0.70703125, + "learning_rate": 7.535967741935484e-05, + "loss": 0.1643, + "step": 15778 + }, + { + "epoch": 0.252464, + "grad_norm": 0.6484375, + "learning_rate": 7.535806451612903e-05, + "loss": 0.203, + "step": 15779 + }, + { + "epoch": 0.25248, + "grad_norm": 1.0078125, + "learning_rate": 7.535645161290323e-05, + "loss": 0.1669, + "step": 15780 + }, + { + "epoch": 0.252496, + "grad_norm": 0.73046875, + "learning_rate": 7.535483870967742e-05, + "loss": 0.1851, + "step": 15781 + }, + { + "epoch": 0.252512, + "grad_norm": 0.6328125, + "learning_rate": 7.535322580645162e-05, + "loss": 0.1738, + "step": 15782 + }, + { + "epoch": 0.252528, + "grad_norm": 0.86328125, + "learning_rate": 7.535161290322582e-05, + "loss": 0.1775, + "step": 15783 + }, + { + "epoch": 0.252544, + "grad_norm": 0.84765625, + "learning_rate": 7.535e-05, + "loss": 0.1921, + "step": 15784 + }, + { + "epoch": 0.25256, + "grad_norm": 0.65625, + "learning_rate": 7.53483870967742e-05, + "loss": 0.1655, + "step": 15785 + }, + { + "epoch": 0.252576, + "grad_norm": 0.9453125, + "learning_rate": 7.534677419354839e-05, + "loss": 0.1371, + "step": 15786 + }, + { + "epoch": 0.252592, + "grad_norm": 0.58203125, + "learning_rate": 7.534516129032259e-05, + "loss": 0.1977, + "step": 15787 + }, + { + "epoch": 0.252608, + "grad_norm": 0.80078125, + "learning_rate": 7.534354838709677e-05, + "loss": 0.1709, + "step": 15788 + }, + { + "epoch": 0.252624, + "grad_norm": 0.59375, + "learning_rate": 7.534193548387097e-05, + "loss": 0.1679, + "step": 15789 + }, + { + "epoch": 0.25264, + "grad_norm": 0.84375, + "learning_rate": 7.534032258064516e-05, + "loss": 0.1927, + "step": 15790 + }, + { + "epoch": 0.252656, + "grad_norm": 0.8046875, + "learning_rate": 7.533870967741936e-05, + "loss": 0.1892, + "step": 15791 + }, + { + "epoch": 0.252672, + "grad_norm": 0.56640625, + "learning_rate": 7.533709677419354e-05, + "loss": 0.1731, + "step": 15792 + }, + { + "epoch": 0.252688, + "grad_norm": 1.6875, + "learning_rate": 7.533548387096774e-05, + "loss": 0.1585, + "step": 15793 + }, + { + "epoch": 0.252704, + "grad_norm": 1.3671875, + "learning_rate": 7.533387096774194e-05, + "loss": 0.1641, + "step": 15794 + }, + { + "epoch": 0.25272, + "grad_norm": 1.3515625, + "learning_rate": 7.533225806451613e-05, + "loss": 0.1753, + "step": 15795 + }, + { + "epoch": 0.252736, + "grad_norm": 0.65625, + "learning_rate": 7.533064516129033e-05, + "loss": 0.171, + "step": 15796 + }, + { + "epoch": 0.252752, + "grad_norm": 0.8203125, + "learning_rate": 7.532903225806453e-05, + "loss": 0.1883, + "step": 15797 + }, + { + "epoch": 0.252768, + "grad_norm": 0.796875, + "learning_rate": 7.532741935483872e-05, + "loss": 0.1782, + "step": 15798 + }, + { + "epoch": 0.252784, + "grad_norm": 1.15625, + "learning_rate": 7.53258064516129e-05, + "loss": 0.1827, + "step": 15799 + }, + { + "epoch": 0.2528, + "grad_norm": 0.91796875, + "learning_rate": 7.53241935483871e-05, + "loss": 0.1951, + "step": 15800 + }, + { + "epoch": 0.252816, + "grad_norm": 0.62890625, + "learning_rate": 7.532258064516129e-05, + "loss": 0.1706, + "step": 15801 + }, + { + "epoch": 0.252832, + "grad_norm": 0.76171875, + "learning_rate": 7.532096774193549e-05, + "loss": 0.175, + "step": 15802 + }, + { + "epoch": 0.252848, + "grad_norm": 0.953125, + "learning_rate": 7.531935483870967e-05, + "loss": 0.203, + "step": 15803 + }, + { + "epoch": 0.252864, + "grad_norm": 0.64453125, + "learning_rate": 7.531774193548387e-05, + "loss": 0.1818, + "step": 15804 + }, + { + "epoch": 0.25288, + "grad_norm": 0.9296875, + "learning_rate": 7.531612903225806e-05, + "loss": 0.1363, + "step": 15805 + }, + { + "epoch": 0.252896, + "grad_norm": 1.09375, + "learning_rate": 7.531451612903226e-05, + "loss": 0.1655, + "step": 15806 + }, + { + "epoch": 0.252912, + "grad_norm": 0.85546875, + "learning_rate": 7.531290322580646e-05, + "loss": 0.2114, + "step": 15807 + }, + { + "epoch": 0.252928, + "grad_norm": 0.625, + "learning_rate": 7.531129032258066e-05, + "loss": 0.1459, + "step": 15808 + }, + { + "epoch": 0.252944, + "grad_norm": 0.97265625, + "learning_rate": 7.530967741935484e-05, + "loss": 0.1725, + "step": 15809 + }, + { + "epoch": 0.25296, + "grad_norm": 0.83984375, + "learning_rate": 7.530806451612904e-05, + "loss": 0.1681, + "step": 15810 + }, + { + "epoch": 0.252976, + "grad_norm": 1.0703125, + "learning_rate": 7.530645161290323e-05, + "loss": 0.1794, + "step": 15811 + }, + { + "epoch": 0.252992, + "grad_norm": 0.6484375, + "learning_rate": 7.530483870967743e-05, + "loss": 0.1559, + "step": 15812 + }, + { + "epoch": 0.253008, + "grad_norm": 1.390625, + "learning_rate": 7.530322580645162e-05, + "loss": 0.1666, + "step": 15813 + }, + { + "epoch": 0.253024, + "grad_norm": 0.421875, + "learning_rate": 7.530161290322581e-05, + "loss": 0.1481, + "step": 15814 + }, + { + "epoch": 0.25304, + "grad_norm": 0.7109375, + "learning_rate": 7.53e-05, + "loss": 0.1931, + "step": 15815 + }, + { + "epoch": 0.253056, + "grad_norm": 0.71484375, + "learning_rate": 7.529838709677419e-05, + "loss": 0.198, + "step": 15816 + }, + { + "epoch": 0.253072, + "grad_norm": 0.77734375, + "learning_rate": 7.529677419354839e-05, + "loss": 0.1789, + "step": 15817 + }, + { + "epoch": 0.253088, + "grad_norm": 0.81640625, + "learning_rate": 7.529516129032259e-05, + "loss": 0.2342, + "step": 15818 + }, + { + "epoch": 0.253104, + "grad_norm": 0.69140625, + "learning_rate": 7.529354838709679e-05, + "loss": 0.1922, + "step": 15819 + }, + { + "epoch": 0.25312, + "grad_norm": 0.71875, + "learning_rate": 7.529193548387097e-05, + "loss": 0.1637, + "step": 15820 + }, + { + "epoch": 0.253136, + "grad_norm": 1.3984375, + "learning_rate": 7.529032258064517e-05, + "loss": 0.1838, + "step": 15821 + }, + { + "epoch": 0.253152, + "grad_norm": 0.69921875, + "learning_rate": 7.528870967741936e-05, + "loss": 0.1783, + "step": 15822 + }, + { + "epoch": 0.253168, + "grad_norm": 0.484375, + "learning_rate": 7.528709677419356e-05, + "loss": 0.1551, + "step": 15823 + }, + { + "epoch": 0.253184, + "grad_norm": 0.78125, + "learning_rate": 7.528548387096774e-05, + "loss": 0.231, + "step": 15824 + }, + { + "epoch": 0.2532, + "grad_norm": 1.2890625, + "learning_rate": 7.528387096774194e-05, + "loss": 0.1915, + "step": 15825 + }, + { + "epoch": 0.253216, + "grad_norm": 0.78125, + "learning_rate": 7.528225806451613e-05, + "loss": 0.1753, + "step": 15826 + }, + { + "epoch": 0.253232, + "grad_norm": 0.97265625, + "learning_rate": 7.528064516129033e-05, + "loss": 0.2345, + "step": 15827 + }, + { + "epoch": 0.253248, + "grad_norm": 0.90625, + "learning_rate": 7.527903225806451e-05, + "loss": 0.2114, + "step": 15828 + }, + { + "epoch": 0.253264, + "grad_norm": 0.67578125, + "learning_rate": 7.527741935483871e-05, + "loss": 0.1637, + "step": 15829 + }, + { + "epoch": 0.25328, + "grad_norm": 0.69921875, + "learning_rate": 7.52758064516129e-05, + "loss": 0.1468, + "step": 15830 + }, + { + "epoch": 0.253296, + "grad_norm": 0.6953125, + "learning_rate": 7.52741935483871e-05, + "loss": 0.1395, + "step": 15831 + }, + { + "epoch": 0.253312, + "grad_norm": 0.56640625, + "learning_rate": 7.52725806451613e-05, + "loss": 0.1481, + "step": 15832 + }, + { + "epoch": 0.253328, + "grad_norm": 0.94921875, + "learning_rate": 7.527096774193549e-05, + "loss": 0.1561, + "step": 15833 + }, + { + "epoch": 0.253344, + "grad_norm": 0.87109375, + "learning_rate": 7.526935483870969e-05, + "loss": 0.1765, + "step": 15834 + }, + { + "epoch": 0.25336, + "grad_norm": 1.015625, + "learning_rate": 7.526774193548387e-05, + "loss": 0.2098, + "step": 15835 + }, + { + "epoch": 0.253376, + "grad_norm": 0.66015625, + "learning_rate": 7.526612903225807e-05, + "loss": 0.1466, + "step": 15836 + }, + { + "epoch": 0.253392, + "grad_norm": 0.87109375, + "learning_rate": 7.526451612903226e-05, + "loss": 0.198, + "step": 15837 + }, + { + "epoch": 0.253408, + "grad_norm": 0.796875, + "learning_rate": 7.526290322580646e-05, + "loss": 0.1792, + "step": 15838 + }, + { + "epoch": 0.253424, + "grad_norm": 0.59375, + "learning_rate": 7.526129032258064e-05, + "loss": 0.1544, + "step": 15839 + }, + { + "epoch": 0.25344, + "grad_norm": 0.71484375, + "learning_rate": 7.525967741935484e-05, + "loss": 0.1393, + "step": 15840 + }, + { + "epoch": 0.253456, + "grad_norm": 1.28125, + "learning_rate": 7.525806451612903e-05, + "loss": 0.1757, + "step": 15841 + }, + { + "epoch": 0.253472, + "grad_norm": 0.95703125, + "learning_rate": 7.525645161290323e-05, + "loss": 0.2267, + "step": 15842 + }, + { + "epoch": 0.253488, + "grad_norm": 1.078125, + "learning_rate": 7.525483870967743e-05, + "loss": 0.1561, + "step": 15843 + }, + { + "epoch": 0.253504, + "grad_norm": 0.83203125, + "learning_rate": 7.525322580645163e-05, + "loss": 0.1618, + "step": 15844 + }, + { + "epoch": 0.25352, + "grad_norm": 0.8671875, + "learning_rate": 7.525161290322581e-05, + "loss": 0.23, + "step": 15845 + }, + { + "epoch": 0.253536, + "grad_norm": 0.9609375, + "learning_rate": 7.525e-05, + "loss": 0.2046, + "step": 15846 + }, + { + "epoch": 0.253552, + "grad_norm": 0.8046875, + "learning_rate": 7.52483870967742e-05, + "loss": 0.1803, + "step": 15847 + }, + { + "epoch": 0.253568, + "grad_norm": 0.78515625, + "learning_rate": 7.524677419354839e-05, + "loss": 0.1353, + "step": 15848 + }, + { + "epoch": 0.253584, + "grad_norm": 0.75390625, + "learning_rate": 7.524516129032258e-05, + "loss": 0.1719, + "step": 15849 + }, + { + "epoch": 0.2536, + "grad_norm": 1.5703125, + "learning_rate": 7.524354838709677e-05, + "loss": 0.2374, + "step": 15850 + }, + { + "epoch": 0.253616, + "grad_norm": 0.69140625, + "learning_rate": 7.524193548387097e-05, + "loss": 0.1549, + "step": 15851 + }, + { + "epoch": 0.253632, + "grad_norm": 0.46875, + "learning_rate": 7.524032258064516e-05, + "loss": 0.1439, + "step": 15852 + }, + { + "epoch": 0.253648, + "grad_norm": 1.25, + "learning_rate": 7.523870967741936e-05, + "loss": 0.1928, + "step": 15853 + }, + { + "epoch": 0.253664, + "grad_norm": 0.96484375, + "learning_rate": 7.523709677419356e-05, + "loss": 0.177, + "step": 15854 + }, + { + "epoch": 0.25368, + "grad_norm": 0.625, + "learning_rate": 7.523548387096776e-05, + "loss": 0.1806, + "step": 15855 + }, + { + "epoch": 0.253696, + "grad_norm": 0.8359375, + "learning_rate": 7.523387096774194e-05, + "loss": 0.1754, + "step": 15856 + }, + { + "epoch": 0.253712, + "grad_norm": 0.67578125, + "learning_rate": 7.523225806451614e-05, + "loss": 0.1602, + "step": 15857 + }, + { + "epoch": 0.253728, + "grad_norm": 0.76171875, + "learning_rate": 7.523064516129033e-05, + "loss": 0.1537, + "step": 15858 + }, + { + "epoch": 0.253744, + "grad_norm": 0.515625, + "learning_rate": 7.522903225806453e-05, + "loss": 0.1724, + "step": 15859 + }, + { + "epoch": 0.25376, + "grad_norm": 0.6328125, + "learning_rate": 7.522741935483871e-05, + "loss": 0.1583, + "step": 15860 + }, + { + "epoch": 0.253776, + "grad_norm": 0.70703125, + "learning_rate": 7.52258064516129e-05, + "loss": 0.1391, + "step": 15861 + }, + { + "epoch": 0.253792, + "grad_norm": 0.7421875, + "learning_rate": 7.52241935483871e-05, + "loss": 0.164, + "step": 15862 + }, + { + "epoch": 0.253808, + "grad_norm": 0.7109375, + "learning_rate": 7.522258064516128e-05, + "loss": 0.1912, + "step": 15863 + }, + { + "epoch": 0.253824, + "grad_norm": 1.0703125, + "learning_rate": 7.522096774193548e-05, + "loss": 0.1701, + "step": 15864 + }, + { + "epoch": 0.25384, + "grad_norm": 0.5859375, + "learning_rate": 7.521935483870967e-05, + "loss": 0.1468, + "step": 15865 + }, + { + "epoch": 0.253856, + "grad_norm": 0.6640625, + "learning_rate": 7.521774193548387e-05, + "loss": 0.1422, + "step": 15866 + }, + { + "epoch": 0.253872, + "grad_norm": 0.86328125, + "learning_rate": 7.521612903225807e-05, + "loss": 0.1911, + "step": 15867 + }, + { + "epoch": 0.253888, + "grad_norm": 0.765625, + "learning_rate": 7.521451612903227e-05, + "loss": 0.2003, + "step": 15868 + }, + { + "epoch": 0.253904, + "grad_norm": 0.59765625, + "learning_rate": 7.521290322580646e-05, + "loss": 0.1382, + "step": 15869 + }, + { + "epoch": 0.25392, + "grad_norm": 0.8515625, + "learning_rate": 7.521129032258066e-05, + "loss": 0.1871, + "step": 15870 + }, + { + "epoch": 0.253936, + "grad_norm": 0.56640625, + "learning_rate": 7.520967741935484e-05, + "loss": 0.1497, + "step": 15871 + }, + { + "epoch": 0.253952, + "grad_norm": 0.81640625, + "learning_rate": 7.520806451612904e-05, + "loss": 0.1664, + "step": 15872 + }, + { + "epoch": 0.253968, + "grad_norm": 0.57421875, + "learning_rate": 7.520645161290323e-05, + "loss": 0.1703, + "step": 15873 + }, + { + "epoch": 0.253984, + "grad_norm": 1.015625, + "learning_rate": 7.520483870967743e-05, + "loss": 0.1976, + "step": 15874 + }, + { + "epoch": 0.254, + "grad_norm": 1.4453125, + "learning_rate": 7.520322580645161e-05, + "loss": 0.1556, + "step": 15875 + }, + { + "epoch": 0.254016, + "grad_norm": 1.1640625, + "learning_rate": 7.520161290322581e-05, + "loss": 0.2276, + "step": 15876 + }, + { + "epoch": 0.254032, + "grad_norm": 0.76953125, + "learning_rate": 7.52e-05, + "loss": 0.154, + "step": 15877 + }, + { + "epoch": 0.254048, + "grad_norm": 1.328125, + "learning_rate": 7.51983870967742e-05, + "loss": 0.1714, + "step": 15878 + }, + { + "epoch": 0.254064, + "grad_norm": 0.625, + "learning_rate": 7.51967741935484e-05, + "loss": 0.1394, + "step": 15879 + }, + { + "epoch": 0.25408, + "grad_norm": 1.03125, + "learning_rate": 7.519516129032258e-05, + "loss": 0.1537, + "step": 15880 + }, + { + "epoch": 0.254096, + "grad_norm": 0.6640625, + "learning_rate": 7.519354838709678e-05, + "loss": 0.1743, + "step": 15881 + }, + { + "epoch": 0.254112, + "grad_norm": 0.734375, + "learning_rate": 7.519193548387097e-05, + "loss": 0.2044, + "step": 15882 + }, + { + "epoch": 0.254128, + "grad_norm": 0.953125, + "learning_rate": 7.519032258064517e-05, + "loss": 0.1628, + "step": 15883 + }, + { + "epoch": 0.254144, + "grad_norm": 0.72265625, + "learning_rate": 7.518870967741936e-05, + "loss": 0.191, + "step": 15884 + }, + { + "epoch": 0.25416, + "grad_norm": 0.6796875, + "learning_rate": 7.518709677419355e-05, + "loss": 0.1658, + "step": 15885 + }, + { + "epoch": 0.254176, + "grad_norm": 0.625, + "learning_rate": 7.518548387096774e-05, + "loss": 0.1652, + "step": 15886 + }, + { + "epoch": 0.254192, + "grad_norm": 0.5703125, + "learning_rate": 7.518387096774194e-05, + "loss": 0.1271, + "step": 15887 + }, + { + "epoch": 0.254208, + "grad_norm": 1.265625, + "learning_rate": 7.518225806451613e-05, + "loss": 0.1967, + "step": 15888 + }, + { + "epoch": 0.254224, + "grad_norm": 0.6953125, + "learning_rate": 7.518064516129033e-05, + "loss": 0.1475, + "step": 15889 + }, + { + "epoch": 0.25424, + "grad_norm": 1.015625, + "learning_rate": 7.517903225806453e-05, + "loss": 0.1674, + "step": 15890 + }, + { + "epoch": 0.254256, + "grad_norm": 0.7421875, + "learning_rate": 7.517741935483871e-05, + "loss": 0.1787, + "step": 15891 + }, + { + "epoch": 0.254272, + "grad_norm": 0.82421875, + "learning_rate": 7.517580645161291e-05, + "loss": 0.118, + "step": 15892 + }, + { + "epoch": 0.254288, + "grad_norm": 0.71875, + "learning_rate": 7.51741935483871e-05, + "loss": 0.1707, + "step": 15893 + }, + { + "epoch": 0.254304, + "grad_norm": 1.1484375, + "learning_rate": 7.51725806451613e-05, + "loss": 0.1942, + "step": 15894 + }, + { + "epoch": 0.25432, + "grad_norm": 0.8515625, + "learning_rate": 7.517096774193548e-05, + "loss": 0.2095, + "step": 15895 + }, + { + "epoch": 0.254336, + "grad_norm": 0.8515625, + "learning_rate": 7.516935483870968e-05, + "loss": 0.1947, + "step": 15896 + }, + { + "epoch": 0.254352, + "grad_norm": 1.2578125, + "learning_rate": 7.516774193548387e-05, + "loss": 0.1914, + "step": 15897 + }, + { + "epoch": 0.254368, + "grad_norm": 1.09375, + "learning_rate": 7.516612903225807e-05, + "loss": 0.1552, + "step": 15898 + }, + { + "epoch": 0.254384, + "grad_norm": 0.796875, + "learning_rate": 7.516451612903225e-05, + "loss": 0.1888, + "step": 15899 + }, + { + "epoch": 0.2544, + "grad_norm": 0.8828125, + "learning_rate": 7.516290322580645e-05, + "loss": 0.1791, + "step": 15900 + }, + { + "epoch": 0.254416, + "grad_norm": 1.046875, + "learning_rate": 7.516129032258064e-05, + "loss": 0.1947, + "step": 15901 + }, + { + "epoch": 0.254432, + "grad_norm": 1.078125, + "learning_rate": 7.515967741935484e-05, + "loss": 0.2067, + "step": 15902 + }, + { + "epoch": 0.254448, + "grad_norm": 0.6953125, + "learning_rate": 7.515806451612904e-05, + "loss": 0.1737, + "step": 15903 + }, + { + "epoch": 0.254464, + "grad_norm": 0.6484375, + "learning_rate": 7.515645161290324e-05, + "loss": 0.158, + "step": 15904 + }, + { + "epoch": 0.25448, + "grad_norm": 1.2578125, + "learning_rate": 7.515483870967743e-05, + "loss": 0.1624, + "step": 15905 + }, + { + "epoch": 0.254496, + "grad_norm": 0.8046875, + "learning_rate": 7.515322580645162e-05, + "loss": 0.203, + "step": 15906 + }, + { + "epoch": 0.254512, + "grad_norm": 0.734375, + "learning_rate": 7.515161290322581e-05, + "loss": 0.1544, + "step": 15907 + }, + { + "epoch": 0.254528, + "grad_norm": 0.70703125, + "learning_rate": 7.515e-05, + "loss": 0.1972, + "step": 15908 + }, + { + "epoch": 0.254544, + "grad_norm": 0.92578125, + "learning_rate": 7.51483870967742e-05, + "loss": 0.1585, + "step": 15909 + }, + { + "epoch": 0.25456, + "grad_norm": 0.60546875, + "learning_rate": 7.514677419354838e-05, + "loss": 0.1356, + "step": 15910 + }, + { + "epoch": 0.254576, + "grad_norm": 0.8515625, + "learning_rate": 7.514516129032258e-05, + "loss": 0.1677, + "step": 15911 + }, + { + "epoch": 0.254592, + "grad_norm": 0.8203125, + "learning_rate": 7.514354838709677e-05, + "loss": 0.1861, + "step": 15912 + }, + { + "epoch": 0.254608, + "grad_norm": 0.8515625, + "learning_rate": 7.514193548387097e-05, + "loss": 0.2374, + "step": 15913 + }, + { + "epoch": 0.254624, + "grad_norm": 1.015625, + "learning_rate": 7.514032258064517e-05, + "loss": 0.1633, + "step": 15914 + }, + { + "epoch": 0.25464, + "grad_norm": 0.6875, + "learning_rate": 7.513870967741937e-05, + "loss": 0.1869, + "step": 15915 + }, + { + "epoch": 0.254656, + "grad_norm": 0.94140625, + "learning_rate": 7.513709677419355e-05, + "loss": 0.1947, + "step": 15916 + }, + { + "epoch": 0.254672, + "grad_norm": 0.8046875, + "learning_rate": 7.513548387096775e-05, + "loss": 0.1937, + "step": 15917 + }, + { + "epoch": 0.254688, + "grad_norm": 0.73046875, + "learning_rate": 7.513387096774194e-05, + "loss": 0.1545, + "step": 15918 + }, + { + "epoch": 0.254704, + "grad_norm": 0.6328125, + "learning_rate": 7.513225806451614e-05, + "loss": 0.157, + "step": 15919 + }, + { + "epoch": 0.25472, + "grad_norm": 1.0625, + "learning_rate": 7.513064516129032e-05, + "loss": 0.1743, + "step": 15920 + }, + { + "epoch": 0.254736, + "grad_norm": 0.73046875, + "learning_rate": 7.512903225806452e-05, + "loss": 0.1761, + "step": 15921 + }, + { + "epoch": 0.254752, + "grad_norm": 0.734375, + "learning_rate": 7.512741935483871e-05, + "loss": 0.1559, + "step": 15922 + }, + { + "epoch": 0.254768, + "grad_norm": 0.92578125, + "learning_rate": 7.512580645161291e-05, + "loss": 0.176, + "step": 15923 + }, + { + "epoch": 0.254784, + "grad_norm": 0.76171875, + "learning_rate": 7.51241935483871e-05, + "loss": 0.1617, + "step": 15924 + }, + { + "epoch": 0.2548, + "grad_norm": 1.1171875, + "learning_rate": 7.512258064516128e-05, + "loss": 0.1821, + "step": 15925 + }, + { + "epoch": 0.254816, + "grad_norm": 0.94921875, + "learning_rate": 7.512096774193548e-05, + "loss": 0.2351, + "step": 15926 + }, + { + "epoch": 0.254832, + "grad_norm": 0.66015625, + "learning_rate": 7.511935483870968e-05, + "loss": 0.1643, + "step": 15927 + }, + { + "epoch": 0.254848, + "grad_norm": 1.2578125, + "learning_rate": 7.511774193548388e-05, + "loss": 0.2188, + "step": 15928 + }, + { + "epoch": 0.254864, + "grad_norm": 1.1015625, + "learning_rate": 7.511612903225807e-05, + "loss": 0.1525, + "step": 15929 + }, + { + "epoch": 0.25488, + "grad_norm": 0.8671875, + "learning_rate": 7.511451612903227e-05, + "loss": 0.1492, + "step": 15930 + }, + { + "epoch": 0.254896, + "grad_norm": 0.71484375, + "learning_rate": 7.511290322580645e-05, + "loss": 0.1543, + "step": 15931 + }, + { + "epoch": 0.254912, + "grad_norm": 0.66015625, + "learning_rate": 7.511129032258065e-05, + "loss": 0.1824, + "step": 15932 + }, + { + "epoch": 0.254928, + "grad_norm": 0.5703125, + "learning_rate": 7.510967741935484e-05, + "loss": 0.1723, + "step": 15933 + }, + { + "epoch": 0.254944, + "grad_norm": 0.91015625, + "learning_rate": 7.510806451612904e-05, + "loss": 0.1552, + "step": 15934 + }, + { + "epoch": 0.25496, + "grad_norm": 0.7421875, + "learning_rate": 7.510645161290322e-05, + "loss": 0.1744, + "step": 15935 + }, + { + "epoch": 0.254976, + "grad_norm": 1.109375, + "learning_rate": 7.510483870967742e-05, + "loss": 0.2097, + "step": 15936 + }, + { + "epoch": 0.254992, + "grad_norm": 0.68359375, + "learning_rate": 7.510322580645161e-05, + "loss": 0.1622, + "step": 15937 + }, + { + "epoch": 0.255008, + "grad_norm": 1.1015625, + "learning_rate": 7.510161290322581e-05, + "loss": 0.1396, + "step": 15938 + }, + { + "epoch": 0.255024, + "grad_norm": 0.6953125, + "learning_rate": 7.510000000000001e-05, + "loss": 0.1573, + "step": 15939 + }, + { + "epoch": 0.25504, + "grad_norm": 0.8203125, + "learning_rate": 7.50983870967742e-05, + "loss": 0.2079, + "step": 15940 + }, + { + "epoch": 0.255056, + "grad_norm": 0.96484375, + "learning_rate": 7.50967741935484e-05, + "loss": 0.1605, + "step": 15941 + }, + { + "epoch": 0.255072, + "grad_norm": 0.87890625, + "learning_rate": 7.509516129032258e-05, + "loss": 0.1547, + "step": 15942 + }, + { + "epoch": 0.255088, + "grad_norm": 0.859375, + "learning_rate": 7.509354838709678e-05, + "loss": 0.1759, + "step": 15943 + }, + { + "epoch": 0.255104, + "grad_norm": 0.55859375, + "learning_rate": 7.509193548387097e-05, + "loss": 0.1352, + "step": 15944 + }, + { + "epoch": 0.25512, + "grad_norm": 0.6171875, + "learning_rate": 7.509032258064517e-05, + "loss": 0.1576, + "step": 15945 + }, + { + "epoch": 0.255136, + "grad_norm": 0.73828125, + "learning_rate": 7.508870967741935e-05, + "loss": 0.1912, + "step": 15946 + }, + { + "epoch": 0.255152, + "grad_norm": 0.625, + "learning_rate": 7.508709677419355e-05, + "loss": 0.1953, + "step": 15947 + }, + { + "epoch": 0.255168, + "grad_norm": 0.8828125, + "learning_rate": 7.508548387096774e-05, + "loss": 0.2028, + "step": 15948 + }, + { + "epoch": 0.255184, + "grad_norm": 0.80078125, + "learning_rate": 7.508387096774194e-05, + "loss": 0.1753, + "step": 15949 + }, + { + "epoch": 0.2552, + "grad_norm": 0.6875, + "learning_rate": 7.508225806451614e-05, + "loss": 0.1284, + "step": 15950 + }, + { + "epoch": 0.255216, + "grad_norm": 0.9375, + "learning_rate": 7.508064516129034e-05, + "loss": 0.2011, + "step": 15951 + }, + { + "epoch": 0.255232, + "grad_norm": 0.76953125, + "learning_rate": 7.507903225806452e-05, + "loss": 0.1374, + "step": 15952 + }, + { + "epoch": 0.255248, + "grad_norm": 0.60546875, + "learning_rate": 7.507741935483872e-05, + "loss": 0.1547, + "step": 15953 + }, + { + "epoch": 0.255264, + "grad_norm": 0.9609375, + "learning_rate": 7.507580645161291e-05, + "loss": 0.1389, + "step": 15954 + }, + { + "epoch": 0.25528, + "grad_norm": 0.859375, + "learning_rate": 7.50741935483871e-05, + "loss": 0.2024, + "step": 15955 + }, + { + "epoch": 0.255296, + "grad_norm": 0.703125, + "learning_rate": 7.50725806451613e-05, + "loss": 0.1496, + "step": 15956 + }, + { + "epoch": 0.255312, + "grad_norm": 0.61328125, + "learning_rate": 7.507096774193548e-05, + "loss": 0.1766, + "step": 15957 + }, + { + "epoch": 0.255328, + "grad_norm": 0.5703125, + "learning_rate": 7.506935483870968e-05, + "loss": 0.1651, + "step": 15958 + }, + { + "epoch": 0.255344, + "grad_norm": 0.97265625, + "learning_rate": 7.506774193548387e-05, + "loss": 0.1812, + "step": 15959 + }, + { + "epoch": 0.25536, + "grad_norm": 0.81640625, + "learning_rate": 7.506612903225807e-05, + "loss": 0.1748, + "step": 15960 + }, + { + "epoch": 0.255376, + "grad_norm": 0.66796875, + "learning_rate": 7.506451612903225e-05, + "loss": 0.1546, + "step": 15961 + }, + { + "epoch": 0.255392, + "grad_norm": 0.66796875, + "learning_rate": 7.506290322580645e-05, + "loss": 0.177, + "step": 15962 + }, + { + "epoch": 0.255408, + "grad_norm": 0.73828125, + "learning_rate": 7.506129032258065e-05, + "loss": 0.1515, + "step": 15963 + }, + { + "epoch": 0.255424, + "grad_norm": 0.75390625, + "learning_rate": 7.505967741935485e-05, + "loss": 0.1875, + "step": 15964 + }, + { + "epoch": 0.25544, + "grad_norm": 0.7421875, + "learning_rate": 7.505806451612904e-05, + "loss": 0.1795, + "step": 15965 + }, + { + "epoch": 0.255456, + "grad_norm": 0.7421875, + "learning_rate": 7.505645161290324e-05, + "loss": 0.2144, + "step": 15966 + }, + { + "epoch": 0.255472, + "grad_norm": 0.89453125, + "learning_rate": 7.505483870967742e-05, + "loss": 0.1994, + "step": 15967 + }, + { + "epoch": 0.255488, + "grad_norm": 0.66796875, + "learning_rate": 7.505322580645162e-05, + "loss": 0.1675, + "step": 15968 + }, + { + "epoch": 0.255504, + "grad_norm": 0.828125, + "learning_rate": 7.505161290322581e-05, + "loss": 0.1703, + "step": 15969 + }, + { + "epoch": 0.25552, + "grad_norm": 0.88671875, + "learning_rate": 7.505e-05, + "loss": 0.1598, + "step": 15970 + }, + { + "epoch": 0.255536, + "grad_norm": 0.83203125, + "learning_rate": 7.50483870967742e-05, + "loss": 0.1523, + "step": 15971 + }, + { + "epoch": 0.255552, + "grad_norm": 0.640625, + "learning_rate": 7.504677419354838e-05, + "loss": 0.1732, + "step": 15972 + }, + { + "epoch": 0.255568, + "grad_norm": 0.90234375, + "learning_rate": 7.504516129032258e-05, + "loss": 0.1829, + "step": 15973 + }, + { + "epoch": 0.255584, + "grad_norm": 0.65234375, + "learning_rate": 7.504354838709678e-05, + "loss": 0.1306, + "step": 15974 + }, + { + "epoch": 0.2556, + "grad_norm": 0.51171875, + "learning_rate": 7.504193548387098e-05, + "loss": 0.1601, + "step": 15975 + }, + { + "epoch": 0.255616, + "grad_norm": 0.8203125, + "learning_rate": 7.504032258064517e-05, + "loss": 0.1514, + "step": 15976 + }, + { + "epoch": 0.255632, + "grad_norm": 1.390625, + "learning_rate": 7.503870967741936e-05, + "loss": 0.1283, + "step": 15977 + }, + { + "epoch": 0.255648, + "grad_norm": 0.57421875, + "learning_rate": 7.503709677419355e-05, + "loss": 0.1578, + "step": 15978 + }, + { + "epoch": 0.255664, + "grad_norm": 1.140625, + "learning_rate": 7.503548387096775e-05, + "loss": 0.231, + "step": 15979 + }, + { + "epoch": 0.25568, + "grad_norm": 0.84375, + "learning_rate": 7.503387096774194e-05, + "loss": 0.1591, + "step": 15980 + }, + { + "epoch": 0.255696, + "grad_norm": 0.6796875, + "learning_rate": 7.503225806451614e-05, + "loss": 0.1618, + "step": 15981 + }, + { + "epoch": 0.255712, + "grad_norm": 0.6875, + "learning_rate": 7.503064516129032e-05, + "loss": 0.146, + "step": 15982 + }, + { + "epoch": 0.255728, + "grad_norm": 0.9921875, + "learning_rate": 7.502903225806452e-05, + "loss": 0.1665, + "step": 15983 + }, + { + "epoch": 0.255744, + "grad_norm": 0.6484375, + "learning_rate": 7.502741935483871e-05, + "loss": 0.165, + "step": 15984 + }, + { + "epoch": 0.25576, + "grad_norm": 1.0078125, + "learning_rate": 7.502580645161291e-05, + "loss": 0.1988, + "step": 15985 + }, + { + "epoch": 0.255776, + "grad_norm": 1.71875, + "learning_rate": 7.50241935483871e-05, + "loss": 0.1507, + "step": 15986 + }, + { + "epoch": 0.255792, + "grad_norm": 0.71484375, + "learning_rate": 7.50225806451613e-05, + "loss": 0.191, + "step": 15987 + }, + { + "epoch": 0.255808, + "grad_norm": 0.96875, + "learning_rate": 7.502096774193549e-05, + "loss": 0.1878, + "step": 15988 + }, + { + "epoch": 0.255824, + "grad_norm": 0.96484375, + "learning_rate": 7.501935483870968e-05, + "loss": 0.1722, + "step": 15989 + }, + { + "epoch": 0.25584, + "grad_norm": 0.94921875, + "learning_rate": 7.501774193548388e-05, + "loss": 0.1638, + "step": 15990 + }, + { + "epoch": 0.255856, + "grad_norm": 0.65234375, + "learning_rate": 7.501612903225806e-05, + "loss": 0.141, + "step": 15991 + }, + { + "epoch": 0.255872, + "grad_norm": 1.0, + "learning_rate": 7.501451612903226e-05, + "loss": 0.1424, + "step": 15992 + }, + { + "epoch": 0.255888, + "grad_norm": 0.90234375, + "learning_rate": 7.501290322580645e-05, + "loss": 0.204, + "step": 15993 + }, + { + "epoch": 0.255904, + "grad_norm": 0.734375, + "learning_rate": 7.501129032258065e-05, + "loss": 0.161, + "step": 15994 + }, + { + "epoch": 0.25592, + "grad_norm": 0.8046875, + "learning_rate": 7.500967741935484e-05, + "loss": 0.1849, + "step": 15995 + }, + { + "epoch": 0.255936, + "grad_norm": 0.9140625, + "learning_rate": 7.500806451612904e-05, + "loss": 0.1691, + "step": 15996 + }, + { + "epoch": 0.255952, + "grad_norm": 0.6796875, + "learning_rate": 7.500645161290322e-05, + "loss": 0.1935, + "step": 15997 + }, + { + "epoch": 0.255968, + "grad_norm": 0.73046875, + "learning_rate": 7.500483870967742e-05, + "loss": 0.1533, + "step": 15998 + }, + { + "epoch": 0.255984, + "grad_norm": 0.5703125, + "learning_rate": 7.500322580645162e-05, + "loss": 0.1439, + "step": 15999 + }, + { + "epoch": 0.256, + "grad_norm": 0.6015625, + "learning_rate": 7.500161290322582e-05, + "loss": 0.1721, + "step": 16000 + }, + { + "epoch": 0.256016, + "grad_norm": 1.09375, + "learning_rate": 7.500000000000001e-05, + "loss": 0.1815, + "step": 16001 + }, + { + "epoch": 0.256032, + "grad_norm": 0.78515625, + "learning_rate": 7.499838709677419e-05, + "loss": 0.1863, + "step": 16002 + }, + { + "epoch": 0.256048, + "grad_norm": 0.89453125, + "learning_rate": 7.499677419354839e-05, + "loss": 0.1893, + "step": 16003 + }, + { + "epoch": 0.256064, + "grad_norm": 0.9140625, + "learning_rate": 7.499516129032258e-05, + "loss": 0.1902, + "step": 16004 + }, + { + "epoch": 0.25608, + "grad_norm": 1.046875, + "learning_rate": 7.499354838709678e-05, + "loss": 0.1812, + "step": 16005 + }, + { + "epoch": 0.256096, + "grad_norm": 0.77734375, + "learning_rate": 7.499193548387096e-05, + "loss": 0.1761, + "step": 16006 + }, + { + "epoch": 0.256112, + "grad_norm": 0.6328125, + "learning_rate": 7.499032258064516e-05, + "loss": 0.1343, + "step": 16007 + }, + { + "epoch": 0.256128, + "grad_norm": 0.81640625, + "learning_rate": 7.498870967741935e-05, + "loss": 0.1426, + "step": 16008 + }, + { + "epoch": 0.256144, + "grad_norm": 0.71875, + "learning_rate": 7.498709677419355e-05, + "loss": 0.1281, + "step": 16009 + }, + { + "epoch": 0.25616, + "grad_norm": 0.6953125, + "learning_rate": 7.498548387096775e-05, + "loss": 0.1539, + "step": 16010 + }, + { + "epoch": 0.256176, + "grad_norm": 0.7578125, + "learning_rate": 7.498387096774195e-05, + "loss": 0.1489, + "step": 16011 + }, + { + "epoch": 0.256192, + "grad_norm": 0.57421875, + "learning_rate": 7.498225806451614e-05, + "loss": 0.1684, + "step": 16012 + }, + { + "epoch": 0.256208, + "grad_norm": 0.67578125, + "learning_rate": 7.498064516129033e-05, + "loss": 0.1295, + "step": 16013 + }, + { + "epoch": 0.256224, + "grad_norm": 0.80859375, + "learning_rate": 7.497903225806452e-05, + "loss": 0.1542, + "step": 16014 + }, + { + "epoch": 0.25624, + "grad_norm": 1.3828125, + "learning_rate": 7.497741935483872e-05, + "loss": 0.1976, + "step": 16015 + }, + { + "epoch": 0.256256, + "grad_norm": 0.5546875, + "learning_rate": 7.49758064516129e-05, + "loss": 0.1409, + "step": 16016 + }, + { + "epoch": 0.256272, + "grad_norm": 1.1875, + "learning_rate": 7.497419354838709e-05, + "loss": 0.1803, + "step": 16017 + }, + { + "epoch": 0.256288, + "grad_norm": 0.7890625, + "learning_rate": 7.497258064516129e-05, + "loss": 0.1641, + "step": 16018 + }, + { + "epoch": 0.256304, + "grad_norm": 1.8515625, + "learning_rate": 7.497096774193548e-05, + "loss": 0.1947, + "step": 16019 + }, + { + "epoch": 0.25632, + "grad_norm": 0.73046875, + "learning_rate": 7.496935483870968e-05, + "loss": 0.1697, + "step": 16020 + }, + { + "epoch": 0.256336, + "grad_norm": 0.7109375, + "learning_rate": 7.496774193548386e-05, + "loss": 0.1781, + "step": 16021 + }, + { + "epoch": 0.256352, + "grad_norm": 0.59375, + "learning_rate": 7.496612903225806e-05, + "loss": 0.181, + "step": 16022 + }, + { + "epoch": 0.256368, + "grad_norm": 0.7421875, + "learning_rate": 7.496451612903226e-05, + "loss": 0.169, + "step": 16023 + }, + { + "epoch": 0.256384, + "grad_norm": 1.0703125, + "learning_rate": 7.496290322580646e-05, + "loss": 0.1525, + "step": 16024 + }, + { + "epoch": 0.2564, + "grad_norm": 0.6875, + "learning_rate": 7.496129032258065e-05, + "loss": 0.1521, + "step": 16025 + }, + { + "epoch": 0.256416, + "grad_norm": 0.6328125, + "learning_rate": 7.495967741935485e-05, + "loss": 0.1802, + "step": 16026 + }, + { + "epoch": 0.256432, + "grad_norm": 0.7890625, + "learning_rate": 7.495806451612903e-05, + "loss": 0.1908, + "step": 16027 + }, + { + "epoch": 0.256448, + "grad_norm": 0.9921875, + "learning_rate": 7.495645161290323e-05, + "loss": 0.2112, + "step": 16028 + }, + { + "epoch": 0.256464, + "grad_norm": 0.75, + "learning_rate": 7.495483870967742e-05, + "loss": 0.1763, + "step": 16029 + }, + { + "epoch": 0.25648, + "grad_norm": 0.625, + "learning_rate": 7.495322580645162e-05, + "loss": 0.142, + "step": 16030 + }, + { + "epoch": 0.256496, + "grad_norm": 0.69921875, + "learning_rate": 7.49516129032258e-05, + "loss": 0.1534, + "step": 16031 + }, + { + "epoch": 0.256512, + "grad_norm": 1.078125, + "learning_rate": 7.495e-05, + "loss": 0.1851, + "step": 16032 + }, + { + "epoch": 0.256528, + "grad_norm": 0.84765625, + "learning_rate": 7.494838709677419e-05, + "loss": 0.171, + "step": 16033 + }, + { + "epoch": 0.256544, + "grad_norm": 1.2578125, + "learning_rate": 7.494677419354839e-05, + "loss": 0.1083, + "step": 16034 + }, + { + "epoch": 0.25656, + "grad_norm": 0.625, + "learning_rate": 7.494516129032259e-05, + "loss": 0.165, + "step": 16035 + }, + { + "epoch": 0.256576, + "grad_norm": 0.55859375, + "learning_rate": 7.494354838709678e-05, + "loss": 0.1499, + "step": 16036 + }, + { + "epoch": 0.256592, + "grad_norm": 0.6640625, + "learning_rate": 7.494193548387098e-05, + "loss": 0.1594, + "step": 16037 + }, + { + "epoch": 0.256608, + "grad_norm": 0.83203125, + "learning_rate": 7.494032258064516e-05, + "loss": 0.1675, + "step": 16038 + }, + { + "epoch": 0.256624, + "grad_norm": 1.3046875, + "learning_rate": 7.493870967741936e-05, + "loss": 0.1565, + "step": 16039 + }, + { + "epoch": 0.25664, + "grad_norm": 0.8359375, + "learning_rate": 7.493709677419355e-05, + "loss": 0.1662, + "step": 16040 + }, + { + "epoch": 0.256656, + "grad_norm": 0.7265625, + "learning_rate": 7.493548387096775e-05, + "loss": 0.1674, + "step": 16041 + }, + { + "epoch": 0.256672, + "grad_norm": 0.828125, + "learning_rate": 7.493387096774193e-05, + "loss": 0.176, + "step": 16042 + }, + { + "epoch": 0.256688, + "grad_norm": 1.296875, + "learning_rate": 7.493225806451613e-05, + "loss": 0.2028, + "step": 16043 + }, + { + "epoch": 0.256704, + "grad_norm": 1.1171875, + "learning_rate": 7.493064516129032e-05, + "loss": 0.1475, + "step": 16044 + }, + { + "epoch": 0.25672, + "grad_norm": 0.5703125, + "learning_rate": 7.492903225806452e-05, + "loss": 0.1439, + "step": 16045 + }, + { + "epoch": 0.256736, + "grad_norm": 0.7734375, + "learning_rate": 7.492741935483872e-05, + "loss": 0.1608, + "step": 16046 + }, + { + "epoch": 0.256752, + "grad_norm": 0.73046875, + "learning_rate": 7.492580645161292e-05, + "loss": 0.1846, + "step": 16047 + }, + { + "epoch": 0.256768, + "grad_norm": 0.6640625, + "learning_rate": 7.49241935483871e-05, + "loss": 0.1404, + "step": 16048 + }, + { + "epoch": 0.256784, + "grad_norm": 1.0390625, + "learning_rate": 7.492258064516129e-05, + "loss": 0.1689, + "step": 16049 + }, + { + "epoch": 0.2568, + "grad_norm": 0.9921875, + "learning_rate": 7.492096774193549e-05, + "loss": 0.1626, + "step": 16050 + }, + { + "epoch": 0.256816, + "grad_norm": 0.7109375, + "learning_rate": 7.491935483870968e-05, + "loss": 0.1464, + "step": 16051 + }, + { + "epoch": 0.256832, + "grad_norm": 0.61328125, + "learning_rate": 7.491774193548388e-05, + "loss": 0.1474, + "step": 16052 + }, + { + "epoch": 0.256848, + "grad_norm": 0.79296875, + "learning_rate": 7.491612903225806e-05, + "loss": 0.1739, + "step": 16053 + }, + { + "epoch": 0.256864, + "grad_norm": 0.89453125, + "learning_rate": 7.491451612903226e-05, + "loss": 0.1641, + "step": 16054 + }, + { + "epoch": 0.25688, + "grad_norm": 0.6640625, + "learning_rate": 7.491290322580645e-05, + "loss": 0.1496, + "step": 16055 + }, + { + "epoch": 0.256896, + "grad_norm": 0.6171875, + "learning_rate": 7.491129032258065e-05, + "loss": 0.1696, + "step": 16056 + }, + { + "epoch": 0.256912, + "grad_norm": 0.6015625, + "learning_rate": 7.490967741935483e-05, + "loss": 0.1718, + "step": 16057 + }, + { + "epoch": 0.256928, + "grad_norm": 0.66796875, + "learning_rate": 7.490806451612903e-05, + "loss": 0.1616, + "step": 16058 + }, + { + "epoch": 0.256944, + "grad_norm": 0.828125, + "learning_rate": 7.490645161290323e-05, + "loss": 0.1536, + "step": 16059 + }, + { + "epoch": 0.25696, + "grad_norm": 0.6875, + "learning_rate": 7.490483870967743e-05, + "loss": 0.199, + "step": 16060 + }, + { + "epoch": 0.256976, + "grad_norm": 0.92578125, + "learning_rate": 7.490322580645162e-05, + "loss": 0.1651, + "step": 16061 + }, + { + "epoch": 0.256992, + "grad_norm": 0.65234375, + "learning_rate": 7.490161290322582e-05, + "loss": 0.167, + "step": 16062 + }, + { + "epoch": 0.257008, + "grad_norm": 0.90625, + "learning_rate": 7.49e-05, + "loss": 0.1583, + "step": 16063 + }, + { + "epoch": 0.257024, + "grad_norm": 0.62109375, + "learning_rate": 7.489838709677419e-05, + "loss": 0.1553, + "step": 16064 + }, + { + "epoch": 0.25704, + "grad_norm": 0.7578125, + "learning_rate": 7.489677419354839e-05, + "loss": 0.1649, + "step": 16065 + }, + { + "epoch": 0.257056, + "grad_norm": 0.64453125, + "learning_rate": 7.489516129032258e-05, + "loss": 0.1637, + "step": 16066 + }, + { + "epoch": 0.257072, + "grad_norm": 0.62890625, + "learning_rate": 7.489354838709678e-05, + "loss": 0.1568, + "step": 16067 + }, + { + "epoch": 0.257088, + "grad_norm": 1.1015625, + "learning_rate": 7.489193548387096e-05, + "loss": 0.1719, + "step": 16068 + }, + { + "epoch": 0.257104, + "grad_norm": 0.74609375, + "learning_rate": 7.489032258064516e-05, + "loss": 0.1589, + "step": 16069 + }, + { + "epoch": 0.25712, + "grad_norm": 0.9921875, + "learning_rate": 7.488870967741936e-05, + "loss": 0.1979, + "step": 16070 + }, + { + "epoch": 0.257136, + "grad_norm": 1.0234375, + "learning_rate": 7.488709677419356e-05, + "loss": 0.1733, + "step": 16071 + }, + { + "epoch": 0.257152, + "grad_norm": 2.53125, + "learning_rate": 7.488548387096775e-05, + "loss": 0.2144, + "step": 16072 + }, + { + "epoch": 0.257168, + "grad_norm": 0.7109375, + "learning_rate": 7.488387096774195e-05, + "loss": 0.1569, + "step": 16073 + }, + { + "epoch": 0.257184, + "grad_norm": 0.59765625, + "learning_rate": 7.488225806451613e-05, + "loss": 0.1488, + "step": 16074 + }, + { + "epoch": 0.2572, + "grad_norm": 0.8203125, + "learning_rate": 7.488064516129033e-05, + "loss": 0.1488, + "step": 16075 + }, + { + "epoch": 0.257216, + "grad_norm": 0.828125, + "learning_rate": 7.487903225806452e-05, + "loss": 0.1538, + "step": 16076 + }, + { + "epoch": 0.257232, + "grad_norm": 0.921875, + "learning_rate": 7.487741935483872e-05, + "loss": 0.1744, + "step": 16077 + }, + { + "epoch": 0.257248, + "grad_norm": 1.0546875, + "learning_rate": 7.48758064516129e-05, + "loss": 0.1501, + "step": 16078 + }, + { + "epoch": 0.257264, + "grad_norm": 0.5625, + "learning_rate": 7.487419354838709e-05, + "loss": 0.1786, + "step": 16079 + }, + { + "epoch": 0.25728, + "grad_norm": 0.85546875, + "learning_rate": 7.487258064516129e-05, + "loss": 0.1588, + "step": 16080 + }, + { + "epoch": 0.257296, + "grad_norm": 0.80078125, + "learning_rate": 7.487096774193548e-05, + "loss": 0.1576, + "step": 16081 + }, + { + "epoch": 0.257312, + "grad_norm": 0.6875, + "learning_rate": 7.486935483870968e-05, + "loss": 0.1756, + "step": 16082 + }, + { + "epoch": 0.257328, + "grad_norm": 0.97265625, + "learning_rate": 7.486774193548388e-05, + "loss": 0.1812, + "step": 16083 + }, + { + "epoch": 0.257344, + "grad_norm": 1.015625, + "learning_rate": 7.486612903225807e-05, + "loss": 0.2006, + "step": 16084 + }, + { + "epoch": 0.25736, + "grad_norm": 0.734375, + "learning_rate": 7.486451612903226e-05, + "loss": 0.1512, + "step": 16085 + }, + { + "epoch": 0.257376, + "grad_norm": 0.8515625, + "learning_rate": 7.486290322580646e-05, + "loss": 0.154, + "step": 16086 + }, + { + "epoch": 0.257392, + "grad_norm": 0.69921875, + "learning_rate": 7.486129032258065e-05, + "loss": 0.1761, + "step": 16087 + }, + { + "epoch": 0.257408, + "grad_norm": 0.9375, + "learning_rate": 7.485967741935485e-05, + "loss": 0.1827, + "step": 16088 + }, + { + "epoch": 0.257424, + "grad_norm": 0.94921875, + "learning_rate": 7.485806451612903e-05, + "loss": 0.1757, + "step": 16089 + }, + { + "epoch": 0.25744, + "grad_norm": 0.86328125, + "learning_rate": 7.485645161290323e-05, + "loss": 0.1766, + "step": 16090 + }, + { + "epoch": 0.257456, + "grad_norm": 1.0625, + "learning_rate": 7.485483870967742e-05, + "loss": 0.1227, + "step": 16091 + }, + { + "epoch": 0.257472, + "grad_norm": 0.67578125, + "learning_rate": 7.485322580645162e-05, + "loss": 0.1556, + "step": 16092 + }, + { + "epoch": 0.257488, + "grad_norm": 0.640625, + "learning_rate": 7.48516129032258e-05, + "loss": 0.1748, + "step": 16093 + }, + { + "epoch": 0.257504, + "grad_norm": 0.78515625, + "learning_rate": 7.485e-05, + "loss": 0.1858, + "step": 16094 + }, + { + "epoch": 0.25752, + "grad_norm": 0.7421875, + "learning_rate": 7.48483870967742e-05, + "loss": 0.1978, + "step": 16095 + }, + { + "epoch": 0.257536, + "grad_norm": 0.67578125, + "learning_rate": 7.484677419354839e-05, + "loss": 0.1993, + "step": 16096 + }, + { + "epoch": 0.257552, + "grad_norm": 1.1328125, + "learning_rate": 7.484516129032259e-05, + "loss": 0.1475, + "step": 16097 + }, + { + "epoch": 0.257568, + "grad_norm": 0.73046875, + "learning_rate": 7.484354838709677e-05, + "loss": 0.1503, + "step": 16098 + }, + { + "epoch": 0.257584, + "grad_norm": 1.0546875, + "learning_rate": 7.484193548387097e-05, + "loss": 0.2112, + "step": 16099 + }, + { + "epoch": 0.2576, + "grad_norm": 0.71875, + "learning_rate": 7.484032258064516e-05, + "loss": 0.1584, + "step": 16100 + }, + { + "epoch": 0.257616, + "grad_norm": 0.66796875, + "learning_rate": 7.483870967741936e-05, + "loss": 0.1526, + "step": 16101 + }, + { + "epoch": 0.257632, + "grad_norm": 0.60546875, + "learning_rate": 7.483709677419355e-05, + "loss": 0.1427, + "step": 16102 + }, + { + "epoch": 0.257648, + "grad_norm": 0.74609375, + "learning_rate": 7.483548387096775e-05, + "loss": 0.1579, + "step": 16103 + }, + { + "epoch": 0.257664, + "grad_norm": 0.83984375, + "learning_rate": 7.483387096774193e-05, + "loss": 0.1609, + "step": 16104 + }, + { + "epoch": 0.25768, + "grad_norm": 1.515625, + "learning_rate": 7.483225806451613e-05, + "loss": 0.1878, + "step": 16105 + }, + { + "epoch": 0.257696, + "grad_norm": 0.5625, + "learning_rate": 7.483064516129033e-05, + "loss": 0.1657, + "step": 16106 + }, + { + "epoch": 0.257712, + "grad_norm": 0.5625, + "learning_rate": 7.482903225806453e-05, + "loss": 0.149, + "step": 16107 + }, + { + "epoch": 0.257728, + "grad_norm": 1.0625, + "learning_rate": 7.482741935483872e-05, + "loss": 0.2004, + "step": 16108 + }, + { + "epoch": 0.257744, + "grad_norm": 1.0625, + "learning_rate": 7.482580645161292e-05, + "loss": 0.1842, + "step": 16109 + }, + { + "epoch": 0.25776, + "grad_norm": 0.5625, + "learning_rate": 7.48241935483871e-05, + "loss": 0.1478, + "step": 16110 + }, + { + "epoch": 0.257776, + "grad_norm": 0.98046875, + "learning_rate": 7.482258064516129e-05, + "loss": 0.1894, + "step": 16111 + }, + { + "epoch": 0.257792, + "grad_norm": 0.66015625, + "learning_rate": 7.482096774193549e-05, + "loss": 0.161, + "step": 16112 + }, + { + "epoch": 0.257808, + "grad_norm": 0.99609375, + "learning_rate": 7.481935483870967e-05, + "loss": 0.1441, + "step": 16113 + }, + { + "epoch": 0.257824, + "grad_norm": 0.6484375, + "learning_rate": 7.481774193548387e-05, + "loss": 0.1844, + "step": 16114 + }, + { + "epoch": 0.25784, + "grad_norm": 0.8125, + "learning_rate": 7.481612903225806e-05, + "loss": 0.1691, + "step": 16115 + }, + { + "epoch": 0.257856, + "grad_norm": 0.64453125, + "learning_rate": 7.481451612903226e-05, + "loss": 0.172, + "step": 16116 + }, + { + "epoch": 0.257872, + "grad_norm": 1.015625, + "learning_rate": 7.481290322580645e-05, + "loss": 0.1935, + "step": 16117 + }, + { + "epoch": 0.257888, + "grad_norm": 0.84375, + "learning_rate": 7.481129032258065e-05, + "loss": 0.1895, + "step": 16118 + }, + { + "epoch": 0.257904, + "grad_norm": 0.79296875, + "learning_rate": 7.480967741935485e-05, + "loss": 0.1567, + "step": 16119 + }, + { + "epoch": 0.25792, + "grad_norm": 0.59375, + "learning_rate": 7.480806451612904e-05, + "loss": 0.1731, + "step": 16120 + }, + { + "epoch": 0.257936, + "grad_norm": 0.9140625, + "learning_rate": 7.480645161290323e-05, + "loss": 0.1787, + "step": 16121 + }, + { + "epoch": 0.257952, + "grad_norm": 1.1875, + "learning_rate": 7.480483870967743e-05, + "loss": 0.183, + "step": 16122 + }, + { + "epoch": 0.257968, + "grad_norm": 0.90234375, + "learning_rate": 7.480322580645162e-05, + "loss": 0.151, + "step": 16123 + }, + { + "epoch": 0.257984, + "grad_norm": 0.828125, + "learning_rate": 7.480161290322582e-05, + "loss": 0.1471, + "step": 16124 + }, + { + "epoch": 0.258, + "grad_norm": 0.7109375, + "learning_rate": 7.48e-05, + "loss": 0.2019, + "step": 16125 + }, + { + "epoch": 0.258016, + "grad_norm": 0.66015625, + "learning_rate": 7.479838709677419e-05, + "loss": 0.1521, + "step": 16126 + }, + { + "epoch": 0.258032, + "grad_norm": 1.0234375, + "learning_rate": 7.479677419354839e-05, + "loss": 0.1794, + "step": 16127 + }, + { + "epoch": 0.258048, + "grad_norm": 0.5234375, + "learning_rate": 7.479516129032257e-05, + "loss": 0.1222, + "step": 16128 + }, + { + "epoch": 0.258064, + "grad_norm": 1.1640625, + "learning_rate": 7.479354838709677e-05, + "loss": 0.1818, + "step": 16129 + }, + { + "epoch": 0.25808, + "grad_norm": 0.9765625, + "learning_rate": 7.479193548387097e-05, + "loss": 0.2008, + "step": 16130 + }, + { + "epoch": 0.258096, + "grad_norm": 0.5703125, + "learning_rate": 7.479032258064517e-05, + "loss": 0.1648, + "step": 16131 + }, + { + "epoch": 0.258112, + "grad_norm": 0.94140625, + "learning_rate": 7.478870967741936e-05, + "loss": 0.1601, + "step": 16132 + }, + { + "epoch": 0.258128, + "grad_norm": 0.6953125, + "learning_rate": 7.478709677419356e-05, + "loss": 0.164, + "step": 16133 + }, + { + "epoch": 0.258144, + "grad_norm": 0.8515625, + "learning_rate": 7.478548387096774e-05, + "loss": 0.1834, + "step": 16134 + }, + { + "epoch": 0.25816, + "grad_norm": 0.72265625, + "learning_rate": 7.478387096774194e-05, + "loss": 0.1571, + "step": 16135 + }, + { + "epoch": 0.258176, + "grad_norm": 0.83984375, + "learning_rate": 7.478225806451613e-05, + "loss": 0.1428, + "step": 16136 + }, + { + "epoch": 0.258192, + "grad_norm": 0.94140625, + "learning_rate": 7.478064516129033e-05, + "loss": 0.1961, + "step": 16137 + }, + { + "epoch": 0.258208, + "grad_norm": 0.765625, + "learning_rate": 7.477903225806452e-05, + "loss": 0.1921, + "step": 16138 + }, + { + "epoch": 0.258224, + "grad_norm": 0.8046875, + "learning_rate": 7.477741935483872e-05, + "loss": 0.1686, + "step": 16139 + }, + { + "epoch": 0.25824, + "grad_norm": 0.796875, + "learning_rate": 7.47758064516129e-05, + "loss": 0.1778, + "step": 16140 + }, + { + "epoch": 0.258256, + "grad_norm": 1.1328125, + "learning_rate": 7.47741935483871e-05, + "loss": 0.1584, + "step": 16141 + }, + { + "epoch": 0.258272, + "grad_norm": 0.56640625, + "learning_rate": 7.47725806451613e-05, + "loss": 0.1672, + "step": 16142 + }, + { + "epoch": 0.258288, + "grad_norm": 0.6640625, + "learning_rate": 7.477096774193549e-05, + "loss": 0.1552, + "step": 16143 + }, + { + "epoch": 0.258304, + "grad_norm": 0.60546875, + "learning_rate": 7.476935483870969e-05, + "loss": 0.1285, + "step": 16144 + }, + { + "epoch": 0.25832, + "grad_norm": 0.80859375, + "learning_rate": 7.476774193548387e-05, + "loss": 0.1732, + "step": 16145 + }, + { + "epoch": 0.258336, + "grad_norm": 0.7734375, + "learning_rate": 7.476612903225807e-05, + "loss": 0.1724, + "step": 16146 + }, + { + "epoch": 0.258352, + "grad_norm": 0.84765625, + "learning_rate": 7.476451612903226e-05, + "loss": 0.1909, + "step": 16147 + }, + { + "epoch": 0.258368, + "grad_norm": 1.328125, + "learning_rate": 7.476290322580646e-05, + "loss": 0.1883, + "step": 16148 + }, + { + "epoch": 0.258384, + "grad_norm": 0.70703125, + "learning_rate": 7.476129032258064e-05, + "loss": 0.1756, + "step": 16149 + }, + { + "epoch": 0.2584, + "grad_norm": 0.95703125, + "learning_rate": 7.475967741935484e-05, + "loss": 0.1789, + "step": 16150 + }, + { + "epoch": 0.258416, + "grad_norm": 0.55078125, + "learning_rate": 7.475806451612903e-05, + "loss": 0.1932, + "step": 16151 + }, + { + "epoch": 0.258432, + "grad_norm": 0.69921875, + "learning_rate": 7.475645161290323e-05, + "loss": 0.1758, + "step": 16152 + }, + { + "epoch": 0.258448, + "grad_norm": 0.515625, + "learning_rate": 7.475483870967742e-05, + "loss": 0.1173, + "step": 16153 + }, + { + "epoch": 0.258464, + "grad_norm": 1.1640625, + "learning_rate": 7.475322580645162e-05, + "loss": 0.1529, + "step": 16154 + }, + { + "epoch": 0.25848, + "grad_norm": 0.609375, + "learning_rate": 7.475161290322581e-05, + "loss": 0.1813, + "step": 16155 + }, + { + "epoch": 0.258496, + "grad_norm": 0.8984375, + "learning_rate": 7.475000000000001e-05, + "loss": 0.1286, + "step": 16156 + }, + { + "epoch": 0.258512, + "grad_norm": 1.3359375, + "learning_rate": 7.47483870967742e-05, + "loss": 0.207, + "step": 16157 + }, + { + "epoch": 0.258528, + "grad_norm": 0.71875, + "learning_rate": 7.474677419354839e-05, + "loss": 0.1557, + "step": 16158 + }, + { + "epoch": 0.258544, + "grad_norm": 0.91015625, + "learning_rate": 7.474516129032259e-05, + "loss": 0.1493, + "step": 16159 + }, + { + "epoch": 0.25856, + "grad_norm": 0.90625, + "learning_rate": 7.474354838709677e-05, + "loss": 0.1742, + "step": 16160 + }, + { + "epoch": 0.258576, + "grad_norm": 1.34375, + "learning_rate": 7.474193548387097e-05, + "loss": 0.1763, + "step": 16161 + }, + { + "epoch": 0.258592, + "grad_norm": 0.76953125, + "learning_rate": 7.474032258064516e-05, + "loss": 0.1837, + "step": 16162 + }, + { + "epoch": 0.258608, + "grad_norm": 0.89453125, + "learning_rate": 7.473870967741936e-05, + "loss": 0.1575, + "step": 16163 + }, + { + "epoch": 0.258624, + "grad_norm": 0.6484375, + "learning_rate": 7.473709677419354e-05, + "loss": 0.171, + "step": 16164 + }, + { + "epoch": 0.25864, + "grad_norm": 0.61328125, + "learning_rate": 7.473548387096774e-05, + "loss": 0.1852, + "step": 16165 + }, + { + "epoch": 0.258656, + "grad_norm": 0.74609375, + "learning_rate": 7.473387096774194e-05, + "loss": 0.1699, + "step": 16166 + }, + { + "epoch": 0.258672, + "grad_norm": 0.8046875, + "learning_rate": 7.473225806451614e-05, + "loss": 0.1902, + "step": 16167 + }, + { + "epoch": 0.258688, + "grad_norm": 0.71875, + "learning_rate": 7.473064516129033e-05, + "loss": 0.1756, + "step": 16168 + }, + { + "epoch": 0.258704, + "grad_norm": 1.375, + "learning_rate": 7.472903225806453e-05, + "loss": 0.1377, + "step": 16169 + }, + { + "epoch": 0.25872, + "grad_norm": 0.734375, + "learning_rate": 7.472741935483871e-05, + "loss": 0.2068, + "step": 16170 + }, + { + "epoch": 0.258736, + "grad_norm": 0.67578125, + "learning_rate": 7.472580645161291e-05, + "loss": 0.1941, + "step": 16171 + }, + { + "epoch": 0.258752, + "grad_norm": 0.73828125, + "learning_rate": 7.47241935483871e-05, + "loss": 0.1693, + "step": 16172 + }, + { + "epoch": 0.258768, + "grad_norm": 0.6796875, + "learning_rate": 7.472258064516129e-05, + "loss": 0.2054, + "step": 16173 + }, + { + "epoch": 0.258784, + "grad_norm": 0.90234375, + "learning_rate": 7.472096774193549e-05, + "loss": 0.136, + "step": 16174 + }, + { + "epoch": 0.2588, + "grad_norm": 0.82421875, + "learning_rate": 7.471935483870967e-05, + "loss": 0.1963, + "step": 16175 + }, + { + "epoch": 0.258816, + "grad_norm": 0.93359375, + "learning_rate": 7.471774193548387e-05, + "loss": 0.2025, + "step": 16176 + }, + { + "epoch": 0.258832, + "grad_norm": 1.2265625, + "learning_rate": 7.471612903225806e-05, + "loss": 0.1708, + "step": 16177 + }, + { + "epoch": 0.258848, + "grad_norm": 0.58203125, + "learning_rate": 7.471451612903226e-05, + "loss": 0.1594, + "step": 16178 + }, + { + "epoch": 0.258864, + "grad_norm": 1.3359375, + "learning_rate": 7.471290322580646e-05, + "loss": 0.2457, + "step": 16179 + }, + { + "epoch": 0.25888, + "grad_norm": 0.859375, + "learning_rate": 7.471129032258066e-05, + "loss": 0.1819, + "step": 16180 + }, + { + "epoch": 0.258896, + "grad_norm": 0.828125, + "learning_rate": 7.470967741935484e-05, + "loss": 0.1594, + "step": 16181 + }, + { + "epoch": 0.258912, + "grad_norm": 0.765625, + "learning_rate": 7.470806451612904e-05, + "loss": 0.1619, + "step": 16182 + }, + { + "epoch": 0.258928, + "grad_norm": 0.94921875, + "learning_rate": 7.470645161290323e-05, + "loss": 0.1599, + "step": 16183 + }, + { + "epoch": 0.258944, + "grad_norm": 0.62890625, + "learning_rate": 7.470483870967743e-05, + "loss": 0.1622, + "step": 16184 + }, + { + "epoch": 0.25896, + "grad_norm": 0.65234375, + "learning_rate": 7.470322580645161e-05, + "loss": 0.1771, + "step": 16185 + }, + { + "epoch": 0.258976, + "grad_norm": 0.78125, + "learning_rate": 7.470161290322581e-05, + "loss": 0.1536, + "step": 16186 + }, + { + "epoch": 0.258992, + "grad_norm": 1.09375, + "learning_rate": 7.47e-05, + "loss": 0.1315, + "step": 16187 + }, + { + "epoch": 0.259008, + "grad_norm": 0.765625, + "learning_rate": 7.469838709677419e-05, + "loss": 0.1711, + "step": 16188 + }, + { + "epoch": 0.259024, + "grad_norm": 0.94921875, + "learning_rate": 7.469677419354839e-05, + "loss": 0.1744, + "step": 16189 + }, + { + "epoch": 0.25904, + "grad_norm": 1.046875, + "learning_rate": 7.469516129032259e-05, + "loss": 0.2508, + "step": 16190 + }, + { + "epoch": 0.259056, + "grad_norm": 1.0390625, + "learning_rate": 7.469354838709678e-05, + "loss": 0.2024, + "step": 16191 + }, + { + "epoch": 0.259072, + "grad_norm": 0.80078125, + "learning_rate": 7.469193548387097e-05, + "loss": 0.2018, + "step": 16192 + }, + { + "epoch": 0.259088, + "grad_norm": 0.765625, + "learning_rate": 7.469032258064517e-05, + "loss": 0.2219, + "step": 16193 + }, + { + "epoch": 0.259104, + "grad_norm": 0.7421875, + "learning_rate": 7.468870967741936e-05, + "loss": 0.178, + "step": 16194 + }, + { + "epoch": 0.25912, + "grad_norm": 0.75, + "learning_rate": 7.468709677419356e-05, + "loss": 0.1572, + "step": 16195 + }, + { + "epoch": 0.259136, + "grad_norm": 0.9296875, + "learning_rate": 7.468548387096774e-05, + "loss": 0.1238, + "step": 16196 + }, + { + "epoch": 0.259152, + "grad_norm": 1.21875, + "learning_rate": 7.468387096774194e-05, + "loss": 0.1629, + "step": 16197 + }, + { + "epoch": 0.259168, + "grad_norm": 1.1328125, + "learning_rate": 7.468225806451613e-05, + "loss": 0.1642, + "step": 16198 + }, + { + "epoch": 0.259184, + "grad_norm": 0.72265625, + "learning_rate": 7.468064516129033e-05, + "loss": 0.1957, + "step": 16199 + }, + { + "epoch": 0.2592, + "grad_norm": 1.0, + "learning_rate": 7.467903225806451e-05, + "loss": 0.1562, + "step": 16200 + }, + { + "epoch": 0.259216, + "grad_norm": 0.91015625, + "learning_rate": 7.467741935483871e-05, + "loss": 0.1747, + "step": 16201 + }, + { + "epoch": 0.259232, + "grad_norm": 0.90625, + "learning_rate": 7.467580645161291e-05, + "loss": 0.1547, + "step": 16202 + }, + { + "epoch": 0.259248, + "grad_norm": 0.71484375, + "learning_rate": 7.467419354838711e-05, + "loss": 0.1836, + "step": 16203 + }, + { + "epoch": 0.259264, + "grad_norm": 0.609375, + "learning_rate": 7.46725806451613e-05, + "loss": 0.1592, + "step": 16204 + }, + { + "epoch": 0.25928, + "grad_norm": 0.50390625, + "learning_rate": 7.467096774193548e-05, + "loss": 0.1315, + "step": 16205 + }, + { + "epoch": 0.259296, + "grad_norm": 0.8046875, + "learning_rate": 7.466935483870968e-05, + "loss": 0.1507, + "step": 16206 + }, + { + "epoch": 0.259312, + "grad_norm": 0.60546875, + "learning_rate": 7.466774193548387e-05, + "loss": 0.1833, + "step": 16207 + }, + { + "epoch": 0.259328, + "grad_norm": 0.87109375, + "learning_rate": 7.466612903225807e-05, + "loss": 0.1963, + "step": 16208 + }, + { + "epoch": 0.259344, + "grad_norm": 1.09375, + "learning_rate": 7.466451612903226e-05, + "loss": 0.1733, + "step": 16209 + }, + { + "epoch": 0.25936, + "grad_norm": 0.80078125, + "learning_rate": 7.466290322580646e-05, + "loss": 0.1883, + "step": 16210 + }, + { + "epoch": 0.259376, + "grad_norm": 0.9765625, + "learning_rate": 7.466129032258064e-05, + "loss": 0.1492, + "step": 16211 + }, + { + "epoch": 0.259392, + "grad_norm": 0.7734375, + "learning_rate": 7.465967741935484e-05, + "loss": 0.1554, + "step": 16212 + }, + { + "epoch": 0.259408, + "grad_norm": 0.95703125, + "learning_rate": 7.465806451612903e-05, + "loss": 0.205, + "step": 16213 + }, + { + "epoch": 0.259424, + "grad_norm": 0.5078125, + "learning_rate": 7.465645161290323e-05, + "loss": 0.1422, + "step": 16214 + }, + { + "epoch": 0.25944, + "grad_norm": 0.8515625, + "learning_rate": 7.465483870967743e-05, + "loss": 0.1832, + "step": 16215 + }, + { + "epoch": 0.259456, + "grad_norm": 1.0625, + "learning_rate": 7.465322580645163e-05, + "loss": 0.2005, + "step": 16216 + }, + { + "epoch": 0.259472, + "grad_norm": 1.4921875, + "learning_rate": 7.465161290322581e-05, + "loss": 0.1668, + "step": 16217 + }, + { + "epoch": 0.259488, + "grad_norm": 0.703125, + "learning_rate": 7.465000000000001e-05, + "loss": 0.1726, + "step": 16218 + }, + { + "epoch": 0.259504, + "grad_norm": 1.0703125, + "learning_rate": 7.46483870967742e-05, + "loss": 0.173, + "step": 16219 + }, + { + "epoch": 0.25952, + "grad_norm": 0.59765625, + "learning_rate": 7.464677419354838e-05, + "loss": 0.1411, + "step": 16220 + }, + { + "epoch": 0.259536, + "grad_norm": 1.25, + "learning_rate": 7.464516129032258e-05, + "loss": 0.2081, + "step": 16221 + }, + { + "epoch": 0.259552, + "grad_norm": 0.84765625, + "learning_rate": 7.464354838709677e-05, + "loss": 0.1777, + "step": 16222 + }, + { + "epoch": 0.259568, + "grad_norm": 0.85546875, + "learning_rate": 7.464193548387097e-05, + "loss": 0.1978, + "step": 16223 + }, + { + "epoch": 0.259584, + "grad_norm": 0.81640625, + "learning_rate": 7.464032258064516e-05, + "loss": 0.1351, + "step": 16224 + }, + { + "epoch": 0.2596, + "grad_norm": 1.234375, + "learning_rate": 7.463870967741936e-05, + "loss": 0.1384, + "step": 16225 + }, + { + "epoch": 0.259616, + "grad_norm": 1.1875, + "learning_rate": 7.463709677419355e-05, + "loss": 0.1684, + "step": 16226 + }, + { + "epoch": 0.259632, + "grad_norm": 1.359375, + "learning_rate": 7.463548387096775e-05, + "loss": 0.2025, + "step": 16227 + }, + { + "epoch": 0.259648, + "grad_norm": 0.671875, + "learning_rate": 7.463387096774194e-05, + "loss": 0.1774, + "step": 16228 + }, + { + "epoch": 0.259664, + "grad_norm": 0.578125, + "learning_rate": 7.463225806451614e-05, + "loss": 0.159, + "step": 16229 + }, + { + "epoch": 0.25968, + "grad_norm": 1.078125, + "learning_rate": 7.463064516129033e-05, + "loss": 0.1751, + "step": 16230 + }, + { + "epoch": 0.259696, + "grad_norm": 0.84765625, + "learning_rate": 7.462903225806453e-05, + "loss": 0.173, + "step": 16231 + }, + { + "epoch": 0.259712, + "grad_norm": 0.9609375, + "learning_rate": 7.462741935483871e-05, + "loss": 0.2068, + "step": 16232 + }, + { + "epoch": 0.259728, + "grad_norm": 0.69921875, + "learning_rate": 7.462580645161291e-05, + "loss": 0.1624, + "step": 16233 + }, + { + "epoch": 0.259744, + "grad_norm": 1.5078125, + "learning_rate": 7.46241935483871e-05, + "loss": 0.231, + "step": 16234 + }, + { + "epoch": 0.25976, + "grad_norm": 0.55859375, + "learning_rate": 7.462258064516128e-05, + "loss": 0.1521, + "step": 16235 + }, + { + "epoch": 0.259776, + "grad_norm": 0.5859375, + "learning_rate": 7.462096774193548e-05, + "loss": 0.1653, + "step": 16236 + }, + { + "epoch": 0.259792, + "grad_norm": 1.0, + "learning_rate": 7.461935483870968e-05, + "loss": 0.1693, + "step": 16237 + }, + { + "epoch": 0.259808, + "grad_norm": 0.75, + "learning_rate": 7.461774193548387e-05, + "loss": 0.1753, + "step": 16238 + }, + { + "epoch": 0.259824, + "grad_norm": 0.78125, + "learning_rate": 7.461612903225807e-05, + "loss": 0.178, + "step": 16239 + }, + { + "epoch": 0.25984, + "grad_norm": 0.9140625, + "learning_rate": 7.461451612903227e-05, + "loss": 0.2086, + "step": 16240 + }, + { + "epoch": 0.259856, + "grad_norm": 0.87890625, + "learning_rate": 7.461290322580645e-05, + "loss": 0.1658, + "step": 16241 + }, + { + "epoch": 0.259872, + "grad_norm": 0.578125, + "learning_rate": 7.461129032258065e-05, + "loss": 0.1598, + "step": 16242 + }, + { + "epoch": 0.259888, + "grad_norm": 0.75, + "learning_rate": 7.460967741935484e-05, + "loss": 0.1984, + "step": 16243 + }, + { + "epoch": 0.259904, + "grad_norm": 0.921875, + "learning_rate": 7.460806451612904e-05, + "loss": 0.1614, + "step": 16244 + }, + { + "epoch": 0.25992, + "grad_norm": 1.0703125, + "learning_rate": 7.460645161290323e-05, + "loss": 0.1799, + "step": 16245 + }, + { + "epoch": 0.259936, + "grad_norm": 0.65625, + "learning_rate": 7.460483870967743e-05, + "loss": 0.1306, + "step": 16246 + }, + { + "epoch": 0.259952, + "grad_norm": 0.90234375, + "learning_rate": 7.460322580645161e-05, + "loss": 0.193, + "step": 16247 + }, + { + "epoch": 0.259968, + "grad_norm": 0.6953125, + "learning_rate": 7.460161290322581e-05, + "loss": 0.172, + "step": 16248 + }, + { + "epoch": 0.259984, + "grad_norm": 0.90234375, + "learning_rate": 7.46e-05, + "loss": 0.1904, + "step": 16249 + }, + { + "epoch": 0.26, + "grad_norm": 0.75, + "learning_rate": 7.45983870967742e-05, + "loss": 0.143, + "step": 16250 + }, + { + "epoch": 0.260016, + "grad_norm": 0.671875, + "learning_rate": 7.45967741935484e-05, + "loss": 0.1671, + "step": 16251 + }, + { + "epoch": 0.260032, + "grad_norm": 0.83203125, + "learning_rate": 7.459516129032258e-05, + "loss": 0.1857, + "step": 16252 + }, + { + "epoch": 0.260048, + "grad_norm": 1.3046875, + "learning_rate": 7.459354838709678e-05, + "loss": 0.1969, + "step": 16253 + }, + { + "epoch": 0.260064, + "grad_norm": 0.73828125, + "learning_rate": 7.459193548387097e-05, + "loss": 0.1725, + "step": 16254 + }, + { + "epoch": 0.26008, + "grad_norm": 0.47265625, + "learning_rate": 7.459032258064517e-05, + "loss": 0.138, + "step": 16255 + }, + { + "epoch": 0.260096, + "grad_norm": 0.9375, + "learning_rate": 7.458870967741935e-05, + "loss": 0.1758, + "step": 16256 + }, + { + "epoch": 0.260112, + "grad_norm": 0.7734375, + "learning_rate": 7.458709677419355e-05, + "loss": 0.171, + "step": 16257 + }, + { + "epoch": 0.260128, + "grad_norm": 0.7109375, + "learning_rate": 7.458548387096774e-05, + "loss": 0.1658, + "step": 16258 + }, + { + "epoch": 0.260144, + "grad_norm": 0.7578125, + "learning_rate": 7.458387096774194e-05, + "loss": 0.181, + "step": 16259 + }, + { + "epoch": 0.26016, + "grad_norm": 0.578125, + "learning_rate": 7.458225806451613e-05, + "loss": 0.1575, + "step": 16260 + }, + { + "epoch": 0.260176, + "grad_norm": 0.85546875, + "learning_rate": 7.458064516129033e-05, + "loss": 0.1623, + "step": 16261 + }, + { + "epoch": 0.260192, + "grad_norm": 0.72265625, + "learning_rate": 7.457903225806452e-05, + "loss": 0.1514, + "step": 16262 + }, + { + "epoch": 0.260208, + "grad_norm": 0.75, + "learning_rate": 7.457741935483872e-05, + "loss": 0.1548, + "step": 16263 + }, + { + "epoch": 0.260224, + "grad_norm": 0.76171875, + "learning_rate": 7.457580645161291e-05, + "loss": 0.1866, + "step": 16264 + }, + { + "epoch": 0.26024, + "grad_norm": 0.6875, + "learning_rate": 7.457419354838711e-05, + "loss": 0.1964, + "step": 16265 + }, + { + "epoch": 0.260256, + "grad_norm": 0.7109375, + "learning_rate": 7.45725806451613e-05, + "loss": 0.1815, + "step": 16266 + }, + { + "epoch": 0.260272, + "grad_norm": 0.96875, + "learning_rate": 7.457096774193548e-05, + "loss": 0.1599, + "step": 16267 + }, + { + "epoch": 0.260288, + "grad_norm": 0.60546875, + "learning_rate": 7.456935483870968e-05, + "loss": 0.1634, + "step": 16268 + }, + { + "epoch": 0.260304, + "grad_norm": 0.625, + "learning_rate": 7.456774193548387e-05, + "loss": 0.136, + "step": 16269 + }, + { + "epoch": 0.26032, + "grad_norm": 0.8515625, + "learning_rate": 7.456612903225807e-05, + "loss": 0.1478, + "step": 16270 + }, + { + "epoch": 0.260336, + "grad_norm": 0.73828125, + "learning_rate": 7.456451612903225e-05, + "loss": 0.1465, + "step": 16271 + }, + { + "epoch": 0.260352, + "grad_norm": 0.8046875, + "learning_rate": 7.456290322580645e-05, + "loss": 0.1436, + "step": 16272 + }, + { + "epoch": 0.260368, + "grad_norm": 0.470703125, + "learning_rate": 7.456129032258064e-05, + "loss": 0.1702, + "step": 16273 + }, + { + "epoch": 0.260384, + "grad_norm": 0.83203125, + "learning_rate": 7.455967741935484e-05, + "loss": 0.2148, + "step": 16274 + }, + { + "epoch": 0.2604, + "grad_norm": 0.890625, + "learning_rate": 7.455806451612904e-05, + "loss": 0.164, + "step": 16275 + }, + { + "epoch": 0.260416, + "grad_norm": 0.82421875, + "learning_rate": 7.455645161290324e-05, + "loss": 0.1655, + "step": 16276 + }, + { + "epoch": 0.260432, + "grad_norm": 0.71484375, + "learning_rate": 7.455483870967742e-05, + "loss": 0.204, + "step": 16277 + }, + { + "epoch": 0.260448, + "grad_norm": 1.25, + "learning_rate": 7.455322580645162e-05, + "loss": 0.1738, + "step": 16278 + }, + { + "epoch": 0.260464, + "grad_norm": 0.703125, + "learning_rate": 7.455161290322581e-05, + "loss": 0.167, + "step": 16279 + }, + { + "epoch": 0.26048, + "grad_norm": 0.73046875, + "learning_rate": 7.455000000000001e-05, + "loss": 0.166, + "step": 16280 + }, + { + "epoch": 0.260496, + "grad_norm": 0.78515625, + "learning_rate": 7.45483870967742e-05, + "loss": 0.1627, + "step": 16281 + }, + { + "epoch": 0.260512, + "grad_norm": 1.1328125, + "learning_rate": 7.454677419354838e-05, + "loss": 0.1694, + "step": 16282 + }, + { + "epoch": 0.260528, + "grad_norm": 0.89453125, + "learning_rate": 7.454516129032258e-05, + "loss": 0.1736, + "step": 16283 + }, + { + "epoch": 0.260544, + "grad_norm": 0.91796875, + "learning_rate": 7.454354838709677e-05, + "loss": 0.1918, + "step": 16284 + }, + { + "epoch": 0.26056, + "grad_norm": 0.66796875, + "learning_rate": 7.454193548387097e-05, + "loss": 0.2129, + "step": 16285 + }, + { + "epoch": 0.260576, + "grad_norm": 0.7578125, + "learning_rate": 7.454032258064517e-05, + "loss": 0.1902, + "step": 16286 + }, + { + "epoch": 0.260592, + "grad_norm": 0.51953125, + "learning_rate": 7.453870967741937e-05, + "loss": 0.1375, + "step": 16287 + }, + { + "epoch": 0.260608, + "grad_norm": 0.8359375, + "learning_rate": 7.453709677419355e-05, + "loss": 0.171, + "step": 16288 + }, + { + "epoch": 0.260624, + "grad_norm": 0.8046875, + "learning_rate": 7.453548387096775e-05, + "loss": 0.1378, + "step": 16289 + }, + { + "epoch": 0.26064, + "grad_norm": 0.72265625, + "learning_rate": 7.453387096774194e-05, + "loss": 0.1605, + "step": 16290 + }, + { + "epoch": 0.260656, + "grad_norm": 0.70703125, + "learning_rate": 7.453225806451614e-05, + "loss": 0.1578, + "step": 16291 + }, + { + "epoch": 0.260672, + "grad_norm": 0.5859375, + "learning_rate": 7.453064516129032e-05, + "loss": 0.1611, + "step": 16292 + }, + { + "epoch": 0.260688, + "grad_norm": 0.68359375, + "learning_rate": 7.452903225806452e-05, + "loss": 0.2072, + "step": 16293 + }, + { + "epoch": 0.260704, + "grad_norm": 0.73046875, + "learning_rate": 7.452741935483871e-05, + "loss": 0.219, + "step": 16294 + }, + { + "epoch": 0.26072, + "grad_norm": 0.91015625, + "learning_rate": 7.452580645161291e-05, + "loss": 0.1555, + "step": 16295 + }, + { + "epoch": 0.260736, + "grad_norm": 0.6875, + "learning_rate": 7.45241935483871e-05, + "loss": 0.1849, + "step": 16296 + }, + { + "epoch": 0.260752, + "grad_norm": 0.7734375, + "learning_rate": 7.45225806451613e-05, + "loss": 0.1912, + "step": 16297 + }, + { + "epoch": 0.260768, + "grad_norm": 0.84375, + "learning_rate": 7.45209677419355e-05, + "loss": 0.2084, + "step": 16298 + }, + { + "epoch": 0.260784, + "grad_norm": 0.71484375, + "learning_rate": 7.451935483870968e-05, + "loss": 0.134, + "step": 16299 + }, + { + "epoch": 0.2608, + "grad_norm": 0.58984375, + "learning_rate": 7.451774193548388e-05, + "loss": 0.1712, + "step": 16300 + }, + { + "epoch": 0.260816, + "grad_norm": 0.65234375, + "learning_rate": 7.451612903225807e-05, + "loss": 0.2089, + "step": 16301 + }, + { + "epoch": 0.260832, + "grad_norm": 0.53125, + "learning_rate": 7.451451612903227e-05, + "loss": 0.1904, + "step": 16302 + }, + { + "epoch": 0.260848, + "grad_norm": 0.8046875, + "learning_rate": 7.451290322580645e-05, + "loss": 0.154, + "step": 16303 + }, + { + "epoch": 0.260864, + "grad_norm": 0.62890625, + "learning_rate": 7.451129032258065e-05, + "loss": 0.1436, + "step": 16304 + }, + { + "epoch": 0.26088, + "grad_norm": 0.6484375, + "learning_rate": 7.450967741935484e-05, + "loss": 0.146, + "step": 16305 + }, + { + "epoch": 0.260896, + "grad_norm": 0.78515625, + "learning_rate": 7.450806451612904e-05, + "loss": 0.156, + "step": 16306 + }, + { + "epoch": 0.260912, + "grad_norm": 0.76171875, + "learning_rate": 7.450645161290322e-05, + "loss": 0.1785, + "step": 16307 + }, + { + "epoch": 0.260928, + "grad_norm": 1.4296875, + "learning_rate": 7.450483870967742e-05, + "loss": 0.1931, + "step": 16308 + }, + { + "epoch": 0.260944, + "grad_norm": 0.6328125, + "learning_rate": 7.450322580645161e-05, + "loss": 0.1551, + "step": 16309 + }, + { + "epoch": 0.26096, + "grad_norm": 0.90625, + "learning_rate": 7.450161290322581e-05, + "loss": 0.1828, + "step": 16310 + }, + { + "epoch": 0.260976, + "grad_norm": 0.6015625, + "learning_rate": 7.450000000000001e-05, + "loss": 0.1723, + "step": 16311 + }, + { + "epoch": 0.260992, + "grad_norm": 0.78125, + "learning_rate": 7.449838709677421e-05, + "loss": 0.1654, + "step": 16312 + }, + { + "epoch": 0.261008, + "grad_norm": 0.8359375, + "learning_rate": 7.44967741935484e-05, + "loss": 0.1656, + "step": 16313 + }, + { + "epoch": 0.261024, + "grad_norm": 0.93359375, + "learning_rate": 7.449516129032258e-05, + "loss": 0.2101, + "step": 16314 + }, + { + "epoch": 0.26104, + "grad_norm": 1.0703125, + "learning_rate": 7.449354838709678e-05, + "loss": 0.2028, + "step": 16315 + }, + { + "epoch": 0.261056, + "grad_norm": 0.5546875, + "learning_rate": 7.449193548387097e-05, + "loss": 0.149, + "step": 16316 + }, + { + "epoch": 0.261072, + "grad_norm": 0.875, + "learning_rate": 7.449032258064517e-05, + "loss": 0.1478, + "step": 16317 + }, + { + "epoch": 0.261088, + "grad_norm": 0.98828125, + "learning_rate": 7.448870967741935e-05, + "loss": 0.1673, + "step": 16318 + }, + { + "epoch": 0.261104, + "grad_norm": 0.828125, + "learning_rate": 7.448709677419355e-05, + "loss": 0.1442, + "step": 16319 + }, + { + "epoch": 0.26112, + "grad_norm": 0.703125, + "learning_rate": 7.448548387096774e-05, + "loss": 0.1633, + "step": 16320 + }, + { + "epoch": 0.261136, + "grad_norm": 0.98828125, + "learning_rate": 7.448387096774194e-05, + "loss": 0.2019, + "step": 16321 + }, + { + "epoch": 0.261152, + "grad_norm": 1.0234375, + "learning_rate": 7.448225806451614e-05, + "loss": 0.1647, + "step": 16322 + }, + { + "epoch": 0.261168, + "grad_norm": 1.1640625, + "learning_rate": 7.448064516129034e-05, + "loss": 0.1936, + "step": 16323 + }, + { + "epoch": 0.261184, + "grad_norm": 0.9609375, + "learning_rate": 7.447903225806452e-05, + "loss": 0.1571, + "step": 16324 + }, + { + "epoch": 0.2612, + "grad_norm": 0.703125, + "learning_rate": 7.447741935483872e-05, + "loss": 0.1838, + "step": 16325 + }, + { + "epoch": 0.261216, + "grad_norm": 0.875, + "learning_rate": 7.447580645161291e-05, + "loss": 0.2085, + "step": 16326 + }, + { + "epoch": 0.261232, + "grad_norm": 0.83203125, + "learning_rate": 7.447419354838711e-05, + "loss": 0.1633, + "step": 16327 + }, + { + "epoch": 0.261248, + "grad_norm": 2.109375, + "learning_rate": 7.44725806451613e-05, + "loss": 0.1642, + "step": 16328 + }, + { + "epoch": 0.261264, + "grad_norm": 0.68359375, + "learning_rate": 7.447096774193548e-05, + "loss": 0.1823, + "step": 16329 + }, + { + "epoch": 0.26128, + "grad_norm": 0.625, + "learning_rate": 7.446935483870968e-05, + "loss": 0.1574, + "step": 16330 + }, + { + "epoch": 0.261296, + "grad_norm": 0.51953125, + "learning_rate": 7.446774193548387e-05, + "loss": 0.1827, + "step": 16331 + }, + { + "epoch": 0.261312, + "grad_norm": 1.1875, + "learning_rate": 7.446612903225807e-05, + "loss": 0.1604, + "step": 16332 + }, + { + "epoch": 0.261328, + "grad_norm": 1.296875, + "learning_rate": 7.446451612903226e-05, + "loss": 0.2195, + "step": 16333 + }, + { + "epoch": 0.261344, + "grad_norm": 0.765625, + "learning_rate": 7.446290322580645e-05, + "loss": 0.1718, + "step": 16334 + }, + { + "epoch": 0.26136, + "grad_norm": 0.89453125, + "learning_rate": 7.446129032258065e-05, + "loss": 0.2078, + "step": 16335 + }, + { + "epoch": 0.261376, + "grad_norm": 0.5859375, + "learning_rate": 7.445967741935485e-05, + "loss": 0.1909, + "step": 16336 + }, + { + "epoch": 0.261392, + "grad_norm": 0.78515625, + "learning_rate": 7.445806451612904e-05, + "loss": 0.1907, + "step": 16337 + }, + { + "epoch": 0.261408, + "grad_norm": 0.640625, + "learning_rate": 7.445645161290324e-05, + "loss": 0.1634, + "step": 16338 + }, + { + "epoch": 0.261424, + "grad_norm": 0.5859375, + "learning_rate": 7.445483870967742e-05, + "loss": 0.1902, + "step": 16339 + }, + { + "epoch": 0.26144, + "grad_norm": 0.49609375, + "learning_rate": 7.445322580645162e-05, + "loss": 0.1747, + "step": 16340 + }, + { + "epoch": 0.261456, + "grad_norm": 0.6484375, + "learning_rate": 7.445161290322581e-05, + "loss": 0.1936, + "step": 16341 + }, + { + "epoch": 0.261472, + "grad_norm": 0.74609375, + "learning_rate": 7.445000000000001e-05, + "loss": 0.1405, + "step": 16342 + }, + { + "epoch": 0.261488, + "grad_norm": 0.671875, + "learning_rate": 7.44483870967742e-05, + "loss": 0.1996, + "step": 16343 + }, + { + "epoch": 0.261504, + "grad_norm": 1.375, + "learning_rate": 7.444677419354838e-05, + "loss": 0.1623, + "step": 16344 + }, + { + "epoch": 0.26152, + "grad_norm": 0.86328125, + "learning_rate": 7.444516129032258e-05, + "loss": 0.18, + "step": 16345 + }, + { + "epoch": 0.261536, + "grad_norm": 1.0234375, + "learning_rate": 7.444354838709678e-05, + "loss": 0.1805, + "step": 16346 + }, + { + "epoch": 0.261552, + "grad_norm": 0.8203125, + "learning_rate": 7.444193548387098e-05, + "loss": 0.206, + "step": 16347 + }, + { + "epoch": 0.261568, + "grad_norm": 0.6875, + "learning_rate": 7.444032258064516e-05, + "loss": 0.1581, + "step": 16348 + }, + { + "epoch": 0.261584, + "grad_norm": 0.87890625, + "learning_rate": 7.443870967741936e-05, + "loss": 0.1767, + "step": 16349 + }, + { + "epoch": 0.2616, + "grad_norm": 0.78515625, + "learning_rate": 7.443709677419355e-05, + "loss": 0.1558, + "step": 16350 + }, + { + "epoch": 0.261616, + "grad_norm": 0.85546875, + "learning_rate": 7.443548387096775e-05, + "loss": 0.1888, + "step": 16351 + }, + { + "epoch": 0.261632, + "grad_norm": 0.47265625, + "learning_rate": 7.443387096774194e-05, + "loss": 0.1293, + "step": 16352 + }, + { + "epoch": 0.261648, + "grad_norm": 0.87890625, + "learning_rate": 7.443225806451614e-05, + "loss": 0.1436, + "step": 16353 + }, + { + "epoch": 0.261664, + "grad_norm": 0.8359375, + "learning_rate": 7.443064516129032e-05, + "loss": 0.2221, + "step": 16354 + }, + { + "epoch": 0.26168, + "grad_norm": 0.63671875, + "learning_rate": 7.442903225806452e-05, + "loss": 0.1904, + "step": 16355 + }, + { + "epoch": 0.261696, + "grad_norm": 1.0703125, + "learning_rate": 7.442741935483871e-05, + "loss": 0.167, + "step": 16356 + }, + { + "epoch": 0.261712, + "grad_norm": 0.65625, + "learning_rate": 7.44258064516129e-05, + "loss": 0.162, + "step": 16357 + }, + { + "epoch": 0.261728, + "grad_norm": 0.94140625, + "learning_rate": 7.44241935483871e-05, + "loss": 0.1367, + "step": 16358 + }, + { + "epoch": 0.261744, + "grad_norm": 0.59765625, + "learning_rate": 7.442258064516129e-05, + "loss": 0.1433, + "step": 16359 + }, + { + "epoch": 0.26176, + "grad_norm": 0.86328125, + "learning_rate": 7.442096774193549e-05, + "loss": 0.1984, + "step": 16360 + }, + { + "epoch": 0.261776, + "grad_norm": 0.671875, + "learning_rate": 7.441935483870968e-05, + "loss": 0.1871, + "step": 16361 + }, + { + "epoch": 0.261792, + "grad_norm": 0.75390625, + "learning_rate": 7.441774193548388e-05, + "loss": 0.1745, + "step": 16362 + }, + { + "epoch": 0.261808, + "grad_norm": 0.5625, + "learning_rate": 7.441612903225806e-05, + "loss": 0.1752, + "step": 16363 + }, + { + "epoch": 0.261824, + "grad_norm": 0.6796875, + "learning_rate": 7.441451612903226e-05, + "loss": 0.1777, + "step": 16364 + }, + { + "epoch": 0.26184, + "grad_norm": 0.67578125, + "learning_rate": 7.441290322580645e-05, + "loss": 0.1882, + "step": 16365 + }, + { + "epoch": 0.261856, + "grad_norm": 1.1875, + "learning_rate": 7.441129032258065e-05, + "loss": 0.1681, + "step": 16366 + }, + { + "epoch": 0.261872, + "grad_norm": 1.0390625, + "learning_rate": 7.440967741935484e-05, + "loss": 0.1984, + "step": 16367 + }, + { + "epoch": 0.261888, + "grad_norm": 0.828125, + "learning_rate": 7.440806451612903e-05, + "loss": 0.1592, + "step": 16368 + }, + { + "epoch": 0.261904, + "grad_norm": 0.96484375, + "learning_rate": 7.440645161290322e-05, + "loss": 0.2068, + "step": 16369 + }, + { + "epoch": 0.26192, + "grad_norm": 0.70703125, + "learning_rate": 7.440483870967742e-05, + "loss": 0.1803, + "step": 16370 + }, + { + "epoch": 0.261936, + "grad_norm": 0.90234375, + "learning_rate": 7.440322580645162e-05, + "loss": 0.1589, + "step": 16371 + }, + { + "epoch": 0.261952, + "grad_norm": 0.69921875, + "learning_rate": 7.440161290322582e-05, + "loss": 0.1732, + "step": 16372 + }, + { + "epoch": 0.261968, + "grad_norm": 1.0625, + "learning_rate": 7.44e-05, + "loss": 0.2306, + "step": 16373 + }, + { + "epoch": 0.261984, + "grad_norm": 0.7890625, + "learning_rate": 7.43983870967742e-05, + "loss": 0.1373, + "step": 16374 + }, + { + "epoch": 0.262, + "grad_norm": 0.703125, + "learning_rate": 7.439677419354839e-05, + "loss": 0.215, + "step": 16375 + }, + { + "epoch": 0.262016, + "grad_norm": 0.921875, + "learning_rate": 7.439516129032258e-05, + "loss": 0.178, + "step": 16376 + }, + { + "epoch": 0.262032, + "grad_norm": 1.1328125, + "learning_rate": 7.439354838709678e-05, + "loss": 0.1603, + "step": 16377 + }, + { + "epoch": 0.262048, + "grad_norm": 1.078125, + "learning_rate": 7.439193548387096e-05, + "loss": 0.188, + "step": 16378 + }, + { + "epoch": 0.262064, + "grad_norm": 1.40625, + "learning_rate": 7.439032258064516e-05, + "loss": 0.1538, + "step": 16379 + }, + { + "epoch": 0.26208, + "grad_norm": 0.59765625, + "learning_rate": 7.438870967741935e-05, + "loss": 0.1893, + "step": 16380 + }, + { + "epoch": 0.262096, + "grad_norm": 0.77734375, + "learning_rate": 7.438709677419355e-05, + "loss": 0.1683, + "step": 16381 + }, + { + "epoch": 0.262112, + "grad_norm": 0.828125, + "learning_rate": 7.438548387096775e-05, + "loss": 0.1524, + "step": 16382 + }, + { + "epoch": 0.262128, + "grad_norm": 0.96875, + "learning_rate": 7.438387096774195e-05, + "loss": 0.1945, + "step": 16383 + }, + { + "epoch": 0.262144, + "grad_norm": 0.609375, + "learning_rate": 7.438225806451613e-05, + "loss": 0.1638, + "step": 16384 + }, + { + "epoch": 0.26216, + "grad_norm": 0.6015625, + "learning_rate": 7.438064516129033e-05, + "loss": 0.1683, + "step": 16385 + }, + { + "epoch": 0.262176, + "grad_norm": 1.2890625, + "learning_rate": 7.437903225806452e-05, + "loss": 0.1747, + "step": 16386 + }, + { + "epoch": 0.262192, + "grad_norm": 0.51953125, + "learning_rate": 7.437741935483872e-05, + "loss": 0.1459, + "step": 16387 + }, + { + "epoch": 0.262208, + "grad_norm": 0.80859375, + "learning_rate": 7.43758064516129e-05, + "loss": 0.1727, + "step": 16388 + }, + { + "epoch": 0.262224, + "grad_norm": 0.80078125, + "learning_rate": 7.43741935483871e-05, + "loss": 0.1825, + "step": 16389 + }, + { + "epoch": 0.26224, + "grad_norm": 0.6015625, + "learning_rate": 7.437258064516129e-05, + "loss": 0.1631, + "step": 16390 + }, + { + "epoch": 0.262256, + "grad_norm": 0.921875, + "learning_rate": 7.437096774193548e-05, + "loss": 0.158, + "step": 16391 + }, + { + "epoch": 0.262272, + "grad_norm": 0.80078125, + "learning_rate": 7.436935483870968e-05, + "loss": 0.1606, + "step": 16392 + }, + { + "epoch": 0.262288, + "grad_norm": 0.625, + "learning_rate": 7.436774193548388e-05, + "loss": 0.1265, + "step": 16393 + }, + { + "epoch": 0.262304, + "grad_norm": 0.72265625, + "learning_rate": 7.436612903225808e-05, + "loss": 0.1796, + "step": 16394 + }, + { + "epoch": 0.26232, + "grad_norm": 0.6953125, + "learning_rate": 7.436451612903226e-05, + "loss": 0.1598, + "step": 16395 + }, + { + "epoch": 0.262336, + "grad_norm": 0.9140625, + "learning_rate": 7.436290322580646e-05, + "loss": 0.2056, + "step": 16396 + }, + { + "epoch": 0.262352, + "grad_norm": 0.97265625, + "learning_rate": 7.436129032258065e-05, + "loss": 0.1648, + "step": 16397 + }, + { + "epoch": 0.262368, + "grad_norm": 0.79296875, + "learning_rate": 7.435967741935485e-05, + "loss": 0.1752, + "step": 16398 + }, + { + "epoch": 0.262384, + "grad_norm": 1.40625, + "learning_rate": 7.435806451612903e-05, + "loss": 0.191, + "step": 16399 + }, + { + "epoch": 0.2624, + "grad_norm": 0.6171875, + "learning_rate": 7.435645161290323e-05, + "loss": 0.1517, + "step": 16400 + }, + { + "epoch": 0.262416, + "grad_norm": 1.3046875, + "learning_rate": 7.435483870967742e-05, + "loss": 0.1646, + "step": 16401 + }, + { + "epoch": 0.262432, + "grad_norm": 0.796875, + "learning_rate": 7.435322580645162e-05, + "loss": 0.1598, + "step": 16402 + }, + { + "epoch": 0.262448, + "grad_norm": 0.671875, + "learning_rate": 7.43516129032258e-05, + "loss": 0.1926, + "step": 16403 + }, + { + "epoch": 0.262464, + "grad_norm": 0.77734375, + "learning_rate": 7.435e-05, + "loss": 0.1648, + "step": 16404 + }, + { + "epoch": 0.26248, + "grad_norm": 0.81640625, + "learning_rate": 7.434838709677419e-05, + "loss": 0.1899, + "step": 16405 + }, + { + "epoch": 0.262496, + "grad_norm": 0.765625, + "learning_rate": 7.434677419354839e-05, + "loss": 0.1688, + "step": 16406 + }, + { + "epoch": 0.262512, + "grad_norm": 1.75, + "learning_rate": 7.434516129032259e-05, + "loss": 0.2347, + "step": 16407 + }, + { + "epoch": 0.262528, + "grad_norm": 0.62890625, + "learning_rate": 7.434354838709678e-05, + "loss": 0.1761, + "step": 16408 + }, + { + "epoch": 0.262544, + "grad_norm": 0.76171875, + "learning_rate": 7.434193548387098e-05, + "loss": 0.1353, + "step": 16409 + }, + { + "epoch": 0.26256, + "grad_norm": 0.70703125, + "learning_rate": 7.434032258064516e-05, + "loss": 0.1905, + "step": 16410 + }, + { + "epoch": 0.262576, + "grad_norm": 0.66015625, + "learning_rate": 7.433870967741936e-05, + "loss": 0.1383, + "step": 16411 + }, + { + "epoch": 0.262592, + "grad_norm": 0.796875, + "learning_rate": 7.433709677419355e-05, + "loss": 0.1545, + "step": 16412 + }, + { + "epoch": 0.262608, + "grad_norm": 0.734375, + "learning_rate": 7.433548387096775e-05, + "loss": 0.1351, + "step": 16413 + }, + { + "epoch": 0.262624, + "grad_norm": 0.62109375, + "learning_rate": 7.433387096774193e-05, + "loss": 0.1645, + "step": 16414 + }, + { + "epoch": 0.26264, + "grad_norm": 1.015625, + "learning_rate": 7.433225806451613e-05, + "loss": 0.175, + "step": 16415 + }, + { + "epoch": 0.262656, + "grad_norm": 0.80859375, + "learning_rate": 7.433064516129032e-05, + "loss": 0.1911, + "step": 16416 + }, + { + "epoch": 0.262672, + "grad_norm": 1.0234375, + "learning_rate": 7.432903225806452e-05, + "loss": 0.1588, + "step": 16417 + }, + { + "epoch": 0.262688, + "grad_norm": 1.4140625, + "learning_rate": 7.432741935483872e-05, + "loss": 0.1512, + "step": 16418 + }, + { + "epoch": 0.262704, + "grad_norm": 0.87109375, + "learning_rate": 7.432580645161292e-05, + "loss": 0.1616, + "step": 16419 + }, + { + "epoch": 0.26272, + "grad_norm": 1.140625, + "learning_rate": 7.43241935483871e-05, + "loss": 0.1688, + "step": 16420 + }, + { + "epoch": 0.262736, + "grad_norm": 0.71484375, + "learning_rate": 7.43225806451613e-05, + "loss": 0.1797, + "step": 16421 + }, + { + "epoch": 0.262752, + "grad_norm": 0.5625, + "learning_rate": 7.432096774193549e-05, + "loss": 0.1656, + "step": 16422 + }, + { + "epoch": 0.262768, + "grad_norm": 0.59765625, + "learning_rate": 7.431935483870968e-05, + "loss": 0.1655, + "step": 16423 + }, + { + "epoch": 0.262784, + "grad_norm": 1.03125, + "learning_rate": 7.431774193548388e-05, + "loss": 0.1573, + "step": 16424 + }, + { + "epoch": 0.2628, + "grad_norm": 0.69921875, + "learning_rate": 7.431612903225806e-05, + "loss": 0.1698, + "step": 16425 + }, + { + "epoch": 0.262816, + "grad_norm": 1.34375, + "learning_rate": 7.431451612903226e-05, + "loss": 0.1867, + "step": 16426 + }, + { + "epoch": 0.262832, + "grad_norm": 0.6171875, + "learning_rate": 7.431290322580645e-05, + "loss": 0.2005, + "step": 16427 + }, + { + "epoch": 0.262848, + "grad_norm": 1.09375, + "learning_rate": 7.431129032258065e-05, + "loss": 0.164, + "step": 16428 + }, + { + "epoch": 0.262864, + "grad_norm": 0.94140625, + "learning_rate": 7.430967741935483e-05, + "loss": 0.1892, + "step": 16429 + }, + { + "epoch": 0.26288, + "grad_norm": 0.78125, + "learning_rate": 7.430806451612903e-05, + "loss": 0.1608, + "step": 16430 + }, + { + "epoch": 0.262896, + "grad_norm": 0.94921875, + "learning_rate": 7.430645161290323e-05, + "loss": 0.1555, + "step": 16431 + }, + { + "epoch": 0.262912, + "grad_norm": 0.84765625, + "learning_rate": 7.430483870967743e-05, + "loss": 0.2019, + "step": 16432 + }, + { + "epoch": 0.262928, + "grad_norm": 0.51171875, + "learning_rate": 7.430322580645162e-05, + "loss": 0.1395, + "step": 16433 + }, + { + "epoch": 0.262944, + "grad_norm": 0.77734375, + "learning_rate": 7.430161290322582e-05, + "loss": 0.148, + "step": 16434 + }, + { + "epoch": 0.26296, + "grad_norm": 0.63671875, + "learning_rate": 7.43e-05, + "loss": 0.1566, + "step": 16435 + }, + { + "epoch": 0.262976, + "grad_norm": 0.5234375, + "learning_rate": 7.42983870967742e-05, + "loss": 0.1451, + "step": 16436 + }, + { + "epoch": 0.262992, + "grad_norm": 1.3359375, + "learning_rate": 7.429677419354839e-05, + "loss": 0.2, + "step": 16437 + }, + { + "epoch": 0.263008, + "grad_norm": 1.078125, + "learning_rate": 7.429516129032258e-05, + "loss": 0.1945, + "step": 16438 + }, + { + "epoch": 0.263024, + "grad_norm": 0.67578125, + "learning_rate": 7.429354838709677e-05, + "loss": 0.1418, + "step": 16439 + }, + { + "epoch": 0.26304, + "grad_norm": 1.0078125, + "learning_rate": 7.429193548387096e-05, + "loss": 0.144, + "step": 16440 + }, + { + "epoch": 0.263056, + "grad_norm": 0.765625, + "learning_rate": 7.429032258064516e-05, + "loss": 0.1501, + "step": 16441 + }, + { + "epoch": 0.263072, + "grad_norm": 0.81640625, + "learning_rate": 7.428870967741936e-05, + "loss": 0.1643, + "step": 16442 + }, + { + "epoch": 0.263088, + "grad_norm": 0.76171875, + "learning_rate": 7.428709677419356e-05, + "loss": 0.1429, + "step": 16443 + }, + { + "epoch": 0.263104, + "grad_norm": 0.7109375, + "learning_rate": 7.428548387096775e-05, + "loss": 0.1759, + "step": 16444 + }, + { + "epoch": 0.26312, + "grad_norm": 1.2578125, + "learning_rate": 7.428387096774195e-05, + "loss": 0.1662, + "step": 16445 + }, + { + "epoch": 0.263136, + "grad_norm": 0.78125, + "learning_rate": 7.428225806451613e-05, + "loss": 0.2095, + "step": 16446 + }, + { + "epoch": 0.263152, + "grad_norm": 0.94921875, + "learning_rate": 7.428064516129033e-05, + "loss": 0.1632, + "step": 16447 + }, + { + "epoch": 0.263168, + "grad_norm": 0.90234375, + "learning_rate": 7.427903225806452e-05, + "loss": 0.1442, + "step": 16448 + }, + { + "epoch": 0.263184, + "grad_norm": 0.7734375, + "learning_rate": 7.427741935483872e-05, + "loss": 0.1888, + "step": 16449 + }, + { + "epoch": 0.2632, + "grad_norm": 0.85546875, + "learning_rate": 7.42758064516129e-05, + "loss": 0.1988, + "step": 16450 + }, + { + "epoch": 0.263216, + "grad_norm": 0.6640625, + "learning_rate": 7.42741935483871e-05, + "loss": 0.1605, + "step": 16451 + }, + { + "epoch": 0.263232, + "grad_norm": 1.0, + "learning_rate": 7.427258064516129e-05, + "loss": 0.1931, + "step": 16452 + }, + { + "epoch": 0.263248, + "grad_norm": 0.7265625, + "learning_rate": 7.427096774193549e-05, + "loss": 0.1483, + "step": 16453 + }, + { + "epoch": 0.263264, + "grad_norm": 0.671875, + "learning_rate": 7.426935483870969e-05, + "loss": 0.1542, + "step": 16454 + }, + { + "epoch": 0.26328, + "grad_norm": 1.09375, + "learning_rate": 7.426774193548387e-05, + "loss": 0.1185, + "step": 16455 + }, + { + "epoch": 0.263296, + "grad_norm": 0.94921875, + "learning_rate": 7.426612903225807e-05, + "loss": 0.1956, + "step": 16456 + }, + { + "epoch": 0.263312, + "grad_norm": 1.1875, + "learning_rate": 7.426451612903226e-05, + "loss": 0.1893, + "step": 16457 + }, + { + "epoch": 0.263328, + "grad_norm": 0.51953125, + "learning_rate": 7.426290322580646e-05, + "loss": 0.1554, + "step": 16458 + }, + { + "epoch": 0.263344, + "grad_norm": 0.423828125, + "learning_rate": 7.426129032258065e-05, + "loss": 0.1359, + "step": 16459 + }, + { + "epoch": 0.26336, + "grad_norm": 1.1796875, + "learning_rate": 7.425967741935485e-05, + "loss": 0.1923, + "step": 16460 + }, + { + "epoch": 0.263376, + "grad_norm": 1.265625, + "learning_rate": 7.425806451612903e-05, + "loss": 0.199, + "step": 16461 + }, + { + "epoch": 0.263392, + "grad_norm": 0.6796875, + "learning_rate": 7.425645161290323e-05, + "loss": 0.1553, + "step": 16462 + }, + { + "epoch": 0.263408, + "grad_norm": 0.8046875, + "learning_rate": 7.425483870967742e-05, + "loss": 0.1468, + "step": 16463 + }, + { + "epoch": 0.263424, + "grad_norm": 0.8984375, + "learning_rate": 7.425322580645162e-05, + "loss": 0.197, + "step": 16464 + }, + { + "epoch": 0.26344, + "grad_norm": 0.58984375, + "learning_rate": 7.42516129032258e-05, + "loss": 0.1758, + "step": 16465 + }, + { + "epoch": 0.263456, + "grad_norm": 0.80859375, + "learning_rate": 7.425e-05, + "loss": 0.1952, + "step": 16466 + }, + { + "epoch": 0.263472, + "grad_norm": 0.6171875, + "learning_rate": 7.42483870967742e-05, + "loss": 0.162, + "step": 16467 + }, + { + "epoch": 0.263488, + "grad_norm": 1.15625, + "learning_rate": 7.424677419354839e-05, + "loss": 0.1891, + "step": 16468 + }, + { + "epoch": 0.263504, + "grad_norm": 0.6484375, + "learning_rate": 7.424516129032259e-05, + "loss": 0.1632, + "step": 16469 + }, + { + "epoch": 0.26352, + "grad_norm": 0.8671875, + "learning_rate": 7.424354838709677e-05, + "loss": 0.1959, + "step": 16470 + }, + { + "epoch": 0.263536, + "grad_norm": 1.0859375, + "learning_rate": 7.424193548387097e-05, + "loss": 0.1375, + "step": 16471 + }, + { + "epoch": 0.263552, + "grad_norm": 1.328125, + "learning_rate": 7.424032258064516e-05, + "loss": 0.1716, + "step": 16472 + }, + { + "epoch": 0.263568, + "grad_norm": 1.0, + "learning_rate": 7.423870967741936e-05, + "loss": 0.1618, + "step": 16473 + }, + { + "epoch": 0.263584, + "grad_norm": 0.859375, + "learning_rate": 7.423709677419355e-05, + "loss": 0.1548, + "step": 16474 + }, + { + "epoch": 0.2636, + "grad_norm": 1.0546875, + "learning_rate": 7.423548387096774e-05, + "loss": 0.1803, + "step": 16475 + }, + { + "epoch": 0.263616, + "grad_norm": 1.1171875, + "learning_rate": 7.423387096774193e-05, + "loss": 0.1587, + "step": 16476 + }, + { + "epoch": 0.263632, + "grad_norm": 0.4609375, + "learning_rate": 7.423225806451613e-05, + "loss": 0.1541, + "step": 16477 + }, + { + "epoch": 0.263648, + "grad_norm": 0.71484375, + "learning_rate": 7.423064516129033e-05, + "loss": 0.2123, + "step": 16478 + }, + { + "epoch": 0.263664, + "grad_norm": 1.1640625, + "learning_rate": 7.422903225806453e-05, + "loss": 0.1464, + "step": 16479 + }, + { + "epoch": 0.26368, + "grad_norm": 0.6484375, + "learning_rate": 7.422741935483872e-05, + "loss": 0.1543, + "step": 16480 + }, + { + "epoch": 0.263696, + "grad_norm": 0.84765625, + "learning_rate": 7.422580645161292e-05, + "loss": 0.2437, + "step": 16481 + }, + { + "epoch": 0.263712, + "grad_norm": 0.76953125, + "learning_rate": 7.42241935483871e-05, + "loss": 0.2008, + "step": 16482 + }, + { + "epoch": 0.263728, + "grad_norm": 1.265625, + "learning_rate": 7.42225806451613e-05, + "loss": 0.1471, + "step": 16483 + }, + { + "epoch": 0.263744, + "grad_norm": 0.5234375, + "learning_rate": 7.422096774193549e-05, + "loss": 0.1356, + "step": 16484 + }, + { + "epoch": 0.26376, + "grad_norm": 1.6953125, + "learning_rate": 7.421935483870967e-05, + "loss": 0.182, + "step": 16485 + }, + { + "epoch": 0.263776, + "grad_norm": 0.7109375, + "learning_rate": 7.421774193548387e-05, + "loss": 0.1344, + "step": 16486 + }, + { + "epoch": 0.263792, + "grad_norm": 0.70703125, + "learning_rate": 7.421612903225806e-05, + "loss": 0.1194, + "step": 16487 + }, + { + "epoch": 0.263808, + "grad_norm": 0.62109375, + "learning_rate": 7.421451612903226e-05, + "loss": 0.1646, + "step": 16488 + }, + { + "epoch": 0.263824, + "grad_norm": 0.91015625, + "learning_rate": 7.421290322580646e-05, + "loss": 0.1624, + "step": 16489 + }, + { + "epoch": 0.26384, + "grad_norm": 1.0546875, + "learning_rate": 7.421129032258064e-05, + "loss": 0.1621, + "step": 16490 + }, + { + "epoch": 0.263856, + "grad_norm": 0.65625, + "learning_rate": 7.420967741935484e-05, + "loss": 0.1816, + "step": 16491 + }, + { + "epoch": 0.263872, + "grad_norm": 0.59765625, + "learning_rate": 7.420806451612904e-05, + "loss": 0.157, + "step": 16492 + }, + { + "epoch": 0.263888, + "grad_norm": 0.6796875, + "learning_rate": 7.420645161290323e-05, + "loss": 0.1544, + "step": 16493 + }, + { + "epoch": 0.263904, + "grad_norm": 1.09375, + "learning_rate": 7.420483870967743e-05, + "loss": 0.1648, + "step": 16494 + }, + { + "epoch": 0.26392, + "grad_norm": 0.98828125, + "learning_rate": 7.420322580645162e-05, + "loss": 0.1828, + "step": 16495 + }, + { + "epoch": 0.263936, + "grad_norm": 1.0, + "learning_rate": 7.420161290322582e-05, + "loss": 0.1821, + "step": 16496 + }, + { + "epoch": 0.263952, + "grad_norm": 0.99609375, + "learning_rate": 7.42e-05, + "loss": 0.1739, + "step": 16497 + }, + { + "epoch": 0.263968, + "grad_norm": 0.7421875, + "learning_rate": 7.41983870967742e-05, + "loss": 0.1869, + "step": 16498 + }, + { + "epoch": 0.263984, + "grad_norm": 0.8828125, + "learning_rate": 7.419677419354839e-05, + "loss": 0.1965, + "step": 16499 + }, + { + "epoch": 0.264, + "grad_norm": 0.64453125, + "learning_rate": 7.419516129032257e-05, + "loss": 0.1712, + "step": 16500 + }, + { + "epoch": 0.264016, + "grad_norm": 0.86328125, + "learning_rate": 7.419354838709677e-05, + "loss": 0.1489, + "step": 16501 + }, + { + "epoch": 0.264032, + "grad_norm": 0.87109375, + "learning_rate": 7.419193548387097e-05, + "loss": 0.1486, + "step": 16502 + }, + { + "epoch": 0.264048, + "grad_norm": 0.671875, + "learning_rate": 7.419032258064517e-05, + "loss": 0.162, + "step": 16503 + }, + { + "epoch": 0.264064, + "grad_norm": 1.203125, + "learning_rate": 7.418870967741936e-05, + "loss": 0.148, + "step": 16504 + }, + { + "epoch": 0.26408, + "grad_norm": 1.109375, + "learning_rate": 7.418709677419356e-05, + "loss": 0.1575, + "step": 16505 + }, + { + "epoch": 0.264096, + "grad_norm": 0.69921875, + "learning_rate": 7.418548387096774e-05, + "loss": 0.1518, + "step": 16506 + }, + { + "epoch": 0.264112, + "grad_norm": 0.61328125, + "learning_rate": 7.418387096774194e-05, + "loss": 0.1485, + "step": 16507 + }, + { + "epoch": 0.264128, + "grad_norm": 0.859375, + "learning_rate": 7.418225806451613e-05, + "loss": 0.1432, + "step": 16508 + }, + { + "epoch": 0.264144, + "grad_norm": 0.88671875, + "learning_rate": 7.418064516129033e-05, + "loss": 0.1559, + "step": 16509 + }, + { + "epoch": 0.26416, + "grad_norm": 1.25, + "learning_rate": 7.417903225806451e-05, + "loss": 0.1254, + "step": 16510 + }, + { + "epoch": 0.264176, + "grad_norm": 0.62890625, + "learning_rate": 7.417741935483871e-05, + "loss": 0.196, + "step": 16511 + }, + { + "epoch": 0.264192, + "grad_norm": 0.5234375, + "learning_rate": 7.41758064516129e-05, + "loss": 0.1404, + "step": 16512 + }, + { + "epoch": 0.264208, + "grad_norm": 1.5546875, + "learning_rate": 7.41741935483871e-05, + "loss": 0.1684, + "step": 16513 + }, + { + "epoch": 0.264224, + "grad_norm": 0.82421875, + "learning_rate": 7.41725806451613e-05, + "loss": 0.1715, + "step": 16514 + }, + { + "epoch": 0.26424, + "grad_norm": 0.6796875, + "learning_rate": 7.417096774193549e-05, + "loss": 0.1533, + "step": 16515 + }, + { + "epoch": 0.264256, + "grad_norm": 0.7578125, + "learning_rate": 7.416935483870969e-05, + "loss": 0.1713, + "step": 16516 + }, + { + "epoch": 0.264272, + "grad_norm": 0.578125, + "learning_rate": 7.416774193548387e-05, + "loss": 0.1276, + "step": 16517 + }, + { + "epoch": 0.264288, + "grad_norm": 0.765625, + "learning_rate": 7.416612903225807e-05, + "loss": 0.1734, + "step": 16518 + }, + { + "epoch": 0.264304, + "grad_norm": 0.94140625, + "learning_rate": 7.416451612903226e-05, + "loss": 0.1474, + "step": 16519 + }, + { + "epoch": 0.26432, + "grad_norm": 1.953125, + "learning_rate": 7.416290322580646e-05, + "loss": 0.1799, + "step": 16520 + }, + { + "epoch": 0.264336, + "grad_norm": 1.1953125, + "learning_rate": 7.416129032258064e-05, + "loss": 0.1631, + "step": 16521 + }, + { + "epoch": 0.264352, + "grad_norm": 0.60546875, + "learning_rate": 7.415967741935484e-05, + "loss": 0.1588, + "step": 16522 + }, + { + "epoch": 0.264368, + "grad_norm": 0.9375, + "learning_rate": 7.415806451612903e-05, + "loss": 0.1538, + "step": 16523 + }, + { + "epoch": 0.264384, + "grad_norm": 0.7578125, + "learning_rate": 7.415645161290323e-05, + "loss": 0.1562, + "step": 16524 + }, + { + "epoch": 0.2644, + "grad_norm": 0.80859375, + "learning_rate": 7.415483870967741e-05, + "loss": 0.1697, + "step": 16525 + }, + { + "epoch": 0.264416, + "grad_norm": 0.828125, + "learning_rate": 7.415322580645161e-05, + "loss": 0.2066, + "step": 16526 + }, + { + "epoch": 0.264432, + "grad_norm": 0.734375, + "learning_rate": 7.415161290322581e-05, + "loss": 0.1599, + "step": 16527 + }, + { + "epoch": 0.264448, + "grad_norm": 0.7265625, + "learning_rate": 7.415000000000001e-05, + "loss": 0.1746, + "step": 16528 + }, + { + "epoch": 0.264464, + "grad_norm": 0.80078125, + "learning_rate": 7.41483870967742e-05, + "loss": 0.1361, + "step": 16529 + }, + { + "epoch": 0.26448, + "grad_norm": 0.6015625, + "learning_rate": 7.41467741935484e-05, + "loss": 0.1576, + "step": 16530 + }, + { + "epoch": 0.264496, + "grad_norm": 0.71484375, + "learning_rate": 7.414516129032259e-05, + "loss": 0.1949, + "step": 16531 + }, + { + "epoch": 0.264512, + "grad_norm": 0.8203125, + "learning_rate": 7.414354838709677e-05, + "loss": 0.1755, + "step": 16532 + }, + { + "epoch": 0.264528, + "grad_norm": 0.7421875, + "learning_rate": 7.414193548387097e-05, + "loss": 0.1901, + "step": 16533 + }, + { + "epoch": 0.264544, + "grad_norm": 1.0703125, + "learning_rate": 7.414032258064516e-05, + "loss": 0.1985, + "step": 16534 + }, + { + "epoch": 0.26456, + "grad_norm": 0.671875, + "learning_rate": 7.413870967741936e-05, + "loss": 0.1681, + "step": 16535 + }, + { + "epoch": 0.264576, + "grad_norm": 0.65234375, + "learning_rate": 7.413709677419354e-05, + "loss": 0.16, + "step": 16536 + }, + { + "epoch": 0.264592, + "grad_norm": 0.75, + "learning_rate": 7.413548387096774e-05, + "loss": 0.1576, + "step": 16537 + }, + { + "epoch": 0.264608, + "grad_norm": 0.8203125, + "learning_rate": 7.413387096774194e-05, + "loss": 0.1249, + "step": 16538 + }, + { + "epoch": 0.264624, + "grad_norm": 0.6796875, + "learning_rate": 7.413225806451614e-05, + "loss": 0.166, + "step": 16539 + }, + { + "epoch": 0.26464, + "grad_norm": 0.62109375, + "learning_rate": 7.413064516129033e-05, + "loss": 0.1767, + "step": 16540 + }, + { + "epoch": 0.264656, + "grad_norm": 0.625, + "learning_rate": 7.412903225806453e-05, + "loss": 0.1616, + "step": 16541 + }, + { + "epoch": 0.264672, + "grad_norm": 0.6875, + "learning_rate": 7.412741935483871e-05, + "loss": 0.1601, + "step": 16542 + }, + { + "epoch": 0.264688, + "grad_norm": 0.6171875, + "learning_rate": 7.412580645161291e-05, + "loss": 0.1593, + "step": 16543 + }, + { + "epoch": 0.264704, + "grad_norm": 0.62109375, + "learning_rate": 7.41241935483871e-05, + "loss": 0.1372, + "step": 16544 + }, + { + "epoch": 0.26472, + "grad_norm": 0.78515625, + "learning_rate": 7.41225806451613e-05, + "loss": 0.1716, + "step": 16545 + }, + { + "epoch": 0.264736, + "grad_norm": 0.9296875, + "learning_rate": 7.412096774193548e-05, + "loss": 0.1595, + "step": 16546 + }, + { + "epoch": 0.264752, + "grad_norm": 0.671875, + "learning_rate": 7.411935483870967e-05, + "loss": 0.1917, + "step": 16547 + }, + { + "epoch": 0.264768, + "grad_norm": 0.78515625, + "learning_rate": 7.411774193548387e-05, + "loss": 0.1962, + "step": 16548 + }, + { + "epoch": 0.264784, + "grad_norm": 0.5546875, + "learning_rate": 7.411612903225807e-05, + "loss": 0.1225, + "step": 16549 + }, + { + "epoch": 0.2648, + "grad_norm": 0.79296875, + "learning_rate": 7.411451612903227e-05, + "loss": 0.1965, + "step": 16550 + }, + { + "epoch": 0.264816, + "grad_norm": 0.61328125, + "learning_rate": 7.411290322580646e-05, + "loss": 0.1703, + "step": 16551 + }, + { + "epoch": 0.264832, + "grad_norm": 0.76953125, + "learning_rate": 7.411129032258066e-05, + "loss": 0.1854, + "step": 16552 + }, + { + "epoch": 0.264848, + "grad_norm": 0.57421875, + "learning_rate": 7.410967741935484e-05, + "loss": 0.1625, + "step": 16553 + }, + { + "epoch": 0.264864, + "grad_norm": 0.95703125, + "learning_rate": 7.410806451612904e-05, + "loss": 0.1993, + "step": 16554 + }, + { + "epoch": 0.26488, + "grad_norm": 0.6328125, + "learning_rate": 7.410645161290323e-05, + "loss": 0.1452, + "step": 16555 + }, + { + "epoch": 0.264896, + "grad_norm": 0.921875, + "learning_rate": 7.410483870967743e-05, + "loss": 0.1642, + "step": 16556 + }, + { + "epoch": 0.264912, + "grad_norm": 0.78515625, + "learning_rate": 7.410322580645161e-05, + "loss": 0.1589, + "step": 16557 + }, + { + "epoch": 0.264928, + "grad_norm": 0.6015625, + "learning_rate": 7.410161290322581e-05, + "loss": 0.1555, + "step": 16558 + }, + { + "epoch": 0.264944, + "grad_norm": 0.59375, + "learning_rate": 7.41e-05, + "loss": 0.1447, + "step": 16559 + }, + { + "epoch": 0.26496, + "grad_norm": 0.9609375, + "learning_rate": 7.40983870967742e-05, + "loss": 0.1688, + "step": 16560 + }, + { + "epoch": 0.264976, + "grad_norm": 0.80859375, + "learning_rate": 7.409677419354838e-05, + "loss": 0.1674, + "step": 16561 + }, + { + "epoch": 0.264992, + "grad_norm": 0.60546875, + "learning_rate": 7.409516129032258e-05, + "loss": 0.1222, + "step": 16562 + }, + { + "epoch": 0.265008, + "grad_norm": 0.9765625, + "learning_rate": 7.409354838709678e-05, + "loss": 0.1838, + "step": 16563 + }, + { + "epoch": 0.265024, + "grad_norm": 0.7109375, + "learning_rate": 7.409193548387097e-05, + "loss": 0.1581, + "step": 16564 + }, + { + "epoch": 0.26504, + "grad_norm": 0.875, + "learning_rate": 7.409032258064517e-05, + "loss": 0.1863, + "step": 16565 + }, + { + "epoch": 0.265056, + "grad_norm": 0.75390625, + "learning_rate": 7.408870967741936e-05, + "loss": 0.1327, + "step": 16566 + }, + { + "epoch": 0.265072, + "grad_norm": 0.6171875, + "learning_rate": 7.408709677419356e-05, + "loss": 0.1629, + "step": 16567 + }, + { + "epoch": 0.265088, + "grad_norm": 0.69921875, + "learning_rate": 7.408548387096774e-05, + "loss": 0.1962, + "step": 16568 + }, + { + "epoch": 0.265104, + "grad_norm": 0.62890625, + "learning_rate": 7.408387096774194e-05, + "loss": 0.1466, + "step": 16569 + }, + { + "epoch": 0.26512, + "grad_norm": 0.7734375, + "learning_rate": 7.408225806451613e-05, + "loss": 0.1425, + "step": 16570 + }, + { + "epoch": 0.265136, + "grad_norm": 0.8046875, + "learning_rate": 7.408064516129033e-05, + "loss": 0.1678, + "step": 16571 + }, + { + "epoch": 0.265152, + "grad_norm": 0.88671875, + "learning_rate": 7.407903225806451e-05, + "loss": 0.1467, + "step": 16572 + }, + { + "epoch": 0.265168, + "grad_norm": 0.68359375, + "learning_rate": 7.407741935483871e-05, + "loss": 0.1539, + "step": 16573 + }, + { + "epoch": 0.265184, + "grad_norm": 0.71484375, + "learning_rate": 7.407580645161291e-05, + "loss": 0.1731, + "step": 16574 + }, + { + "epoch": 0.2652, + "grad_norm": 0.734375, + "learning_rate": 7.407419354838711e-05, + "loss": 0.1391, + "step": 16575 + }, + { + "epoch": 0.265216, + "grad_norm": 0.57421875, + "learning_rate": 7.40725806451613e-05, + "loss": 0.1231, + "step": 16576 + }, + { + "epoch": 0.265232, + "grad_norm": 0.57421875, + "learning_rate": 7.407096774193548e-05, + "loss": 0.118, + "step": 16577 + }, + { + "epoch": 0.265248, + "grad_norm": 0.80078125, + "learning_rate": 7.406935483870968e-05, + "loss": 0.1905, + "step": 16578 + }, + { + "epoch": 0.265264, + "grad_norm": 1.453125, + "learning_rate": 7.406774193548387e-05, + "loss": 0.2201, + "step": 16579 + }, + { + "epoch": 0.26528, + "grad_norm": 0.77734375, + "learning_rate": 7.406612903225807e-05, + "loss": 0.1658, + "step": 16580 + }, + { + "epoch": 0.265296, + "grad_norm": 1.09375, + "learning_rate": 7.406451612903225e-05, + "loss": 0.1453, + "step": 16581 + }, + { + "epoch": 0.265312, + "grad_norm": 1.0859375, + "learning_rate": 7.406290322580645e-05, + "loss": 0.1743, + "step": 16582 + }, + { + "epoch": 0.265328, + "grad_norm": 1.09375, + "learning_rate": 7.406129032258064e-05, + "loss": 0.1714, + "step": 16583 + }, + { + "epoch": 0.265344, + "grad_norm": 0.63671875, + "learning_rate": 7.405967741935484e-05, + "loss": 0.1361, + "step": 16584 + }, + { + "epoch": 0.26536, + "grad_norm": 0.89453125, + "learning_rate": 7.405806451612904e-05, + "loss": 0.1609, + "step": 16585 + }, + { + "epoch": 0.265376, + "grad_norm": 0.75, + "learning_rate": 7.405645161290323e-05, + "loss": 0.1632, + "step": 16586 + }, + { + "epoch": 0.265392, + "grad_norm": 0.94921875, + "learning_rate": 7.405483870967743e-05, + "loss": 0.153, + "step": 16587 + }, + { + "epoch": 0.265408, + "grad_norm": 0.54296875, + "learning_rate": 7.405322580645163e-05, + "loss": 0.1571, + "step": 16588 + }, + { + "epoch": 0.265424, + "grad_norm": 0.88671875, + "learning_rate": 7.405161290322581e-05, + "loss": 0.2007, + "step": 16589 + }, + { + "epoch": 0.26544, + "grad_norm": 0.6953125, + "learning_rate": 7.405000000000001e-05, + "loss": 0.1424, + "step": 16590 + }, + { + "epoch": 0.265456, + "grad_norm": 1.171875, + "learning_rate": 7.40483870967742e-05, + "loss": 0.1675, + "step": 16591 + }, + { + "epoch": 0.265472, + "grad_norm": 0.71484375, + "learning_rate": 7.40467741935484e-05, + "loss": 0.1114, + "step": 16592 + }, + { + "epoch": 0.265488, + "grad_norm": 1.0703125, + "learning_rate": 7.404516129032258e-05, + "loss": 0.2032, + "step": 16593 + }, + { + "epoch": 0.265504, + "grad_norm": 1.6484375, + "learning_rate": 7.404354838709677e-05, + "loss": 0.1239, + "step": 16594 + }, + { + "epoch": 0.26552, + "grad_norm": 0.74609375, + "learning_rate": 7.404193548387097e-05, + "loss": 0.176, + "step": 16595 + }, + { + "epoch": 0.265536, + "grad_norm": 0.99609375, + "learning_rate": 7.404032258064515e-05, + "loss": 0.1717, + "step": 16596 + }, + { + "epoch": 0.265552, + "grad_norm": 1.6640625, + "learning_rate": 7.403870967741935e-05, + "loss": 0.1761, + "step": 16597 + }, + { + "epoch": 0.265568, + "grad_norm": 0.80859375, + "learning_rate": 7.403709677419355e-05, + "loss": 0.192, + "step": 16598 + }, + { + "epoch": 0.265584, + "grad_norm": 1.0703125, + "learning_rate": 7.403548387096775e-05, + "loss": 0.1785, + "step": 16599 + }, + { + "epoch": 0.2656, + "grad_norm": 0.78515625, + "learning_rate": 7.403387096774194e-05, + "loss": 0.1445, + "step": 16600 + }, + { + "epoch": 0.265616, + "grad_norm": 0.94921875, + "learning_rate": 7.403225806451614e-05, + "loss": 0.1622, + "step": 16601 + }, + { + "epoch": 0.265632, + "grad_norm": 1.2734375, + "learning_rate": 7.403064516129033e-05, + "loss": 0.1893, + "step": 16602 + }, + { + "epoch": 0.265648, + "grad_norm": 0.80078125, + "learning_rate": 7.402903225806452e-05, + "loss": 0.2039, + "step": 16603 + }, + { + "epoch": 0.265664, + "grad_norm": 1.703125, + "learning_rate": 7.402741935483871e-05, + "loss": 0.2261, + "step": 16604 + }, + { + "epoch": 0.26568, + "grad_norm": 1.078125, + "learning_rate": 7.402580645161291e-05, + "loss": 0.1602, + "step": 16605 + }, + { + "epoch": 0.265696, + "grad_norm": 0.84765625, + "learning_rate": 7.40241935483871e-05, + "loss": 0.1833, + "step": 16606 + }, + { + "epoch": 0.265712, + "grad_norm": 0.99609375, + "learning_rate": 7.40225806451613e-05, + "loss": 0.1718, + "step": 16607 + }, + { + "epoch": 0.265728, + "grad_norm": 0.62109375, + "learning_rate": 7.402096774193548e-05, + "loss": 0.1265, + "step": 16608 + }, + { + "epoch": 0.265744, + "grad_norm": 0.74609375, + "learning_rate": 7.401935483870968e-05, + "loss": 0.1577, + "step": 16609 + }, + { + "epoch": 0.26576, + "grad_norm": 0.96484375, + "learning_rate": 7.401774193548388e-05, + "loss": 0.1772, + "step": 16610 + }, + { + "epoch": 0.265776, + "grad_norm": 0.8359375, + "learning_rate": 7.401612903225807e-05, + "loss": 0.1959, + "step": 16611 + }, + { + "epoch": 0.265792, + "grad_norm": 0.76171875, + "learning_rate": 7.401451612903227e-05, + "loss": 0.2017, + "step": 16612 + }, + { + "epoch": 0.265808, + "grad_norm": 0.8359375, + "learning_rate": 7.401290322580645e-05, + "loss": 0.1916, + "step": 16613 + }, + { + "epoch": 0.265824, + "grad_norm": 0.62109375, + "learning_rate": 7.401129032258065e-05, + "loss": 0.1597, + "step": 16614 + }, + { + "epoch": 0.26584, + "grad_norm": 0.9453125, + "learning_rate": 7.400967741935484e-05, + "loss": 0.1556, + "step": 16615 + }, + { + "epoch": 0.265856, + "grad_norm": 0.59375, + "learning_rate": 7.400806451612904e-05, + "loss": 0.1543, + "step": 16616 + }, + { + "epoch": 0.265872, + "grad_norm": 0.73828125, + "learning_rate": 7.400645161290322e-05, + "loss": 0.1519, + "step": 16617 + }, + { + "epoch": 0.265888, + "grad_norm": 1.1875, + "learning_rate": 7.400483870967742e-05, + "loss": 0.184, + "step": 16618 + }, + { + "epoch": 0.265904, + "grad_norm": 0.9453125, + "learning_rate": 7.400322580645161e-05, + "loss": 0.2072, + "step": 16619 + }, + { + "epoch": 0.26592, + "grad_norm": 1.140625, + "learning_rate": 7.400161290322581e-05, + "loss": 0.1962, + "step": 16620 + }, + { + "epoch": 0.265936, + "grad_norm": 0.67578125, + "learning_rate": 7.4e-05, + "loss": 0.1191, + "step": 16621 + }, + { + "epoch": 0.265952, + "grad_norm": 1.296875, + "learning_rate": 7.39983870967742e-05, + "loss": 0.1672, + "step": 16622 + }, + { + "epoch": 0.265968, + "grad_norm": 1.2265625, + "learning_rate": 7.39967741935484e-05, + "loss": 0.1458, + "step": 16623 + }, + { + "epoch": 0.265984, + "grad_norm": 1.1171875, + "learning_rate": 7.399516129032258e-05, + "loss": 0.1704, + "step": 16624 + }, + { + "epoch": 0.266, + "grad_norm": 0.76171875, + "learning_rate": 7.399354838709678e-05, + "loss": 0.1558, + "step": 16625 + }, + { + "epoch": 0.266016, + "grad_norm": 0.8203125, + "learning_rate": 7.399193548387097e-05, + "loss": 0.2171, + "step": 16626 + }, + { + "epoch": 0.266032, + "grad_norm": 1.3359375, + "learning_rate": 7.399032258064517e-05, + "loss": 0.1746, + "step": 16627 + }, + { + "epoch": 0.266048, + "grad_norm": 0.74609375, + "learning_rate": 7.398870967741935e-05, + "loss": 0.1725, + "step": 16628 + }, + { + "epoch": 0.266064, + "grad_norm": 1.078125, + "learning_rate": 7.398709677419355e-05, + "loss": 0.2159, + "step": 16629 + }, + { + "epoch": 0.26608, + "grad_norm": 0.8046875, + "learning_rate": 7.398548387096774e-05, + "loss": 0.1569, + "step": 16630 + }, + { + "epoch": 0.266096, + "grad_norm": 0.69921875, + "learning_rate": 7.398387096774194e-05, + "loss": 0.1291, + "step": 16631 + }, + { + "epoch": 0.266112, + "grad_norm": 0.83203125, + "learning_rate": 7.398225806451612e-05, + "loss": 0.1646, + "step": 16632 + }, + { + "epoch": 0.266128, + "grad_norm": 1.0078125, + "learning_rate": 7.398064516129032e-05, + "loss": 0.1698, + "step": 16633 + }, + { + "epoch": 0.266144, + "grad_norm": 0.81640625, + "learning_rate": 7.397903225806452e-05, + "loss": 0.1669, + "step": 16634 + }, + { + "epoch": 0.26616, + "grad_norm": 0.69921875, + "learning_rate": 7.397741935483872e-05, + "loss": 0.1342, + "step": 16635 + }, + { + "epoch": 0.266176, + "grad_norm": 0.8671875, + "learning_rate": 7.397580645161291e-05, + "loss": 0.1529, + "step": 16636 + }, + { + "epoch": 0.266192, + "grad_norm": 0.5546875, + "learning_rate": 7.397419354838711e-05, + "loss": 0.1793, + "step": 16637 + }, + { + "epoch": 0.266208, + "grad_norm": 0.67578125, + "learning_rate": 7.39725806451613e-05, + "loss": 0.1675, + "step": 16638 + }, + { + "epoch": 0.266224, + "grad_norm": 0.6484375, + "learning_rate": 7.39709677419355e-05, + "loss": 0.1644, + "step": 16639 + }, + { + "epoch": 0.26624, + "grad_norm": 1.015625, + "learning_rate": 7.396935483870968e-05, + "loss": 0.1979, + "step": 16640 + }, + { + "epoch": 0.266256, + "grad_norm": 0.86328125, + "learning_rate": 7.396774193548387e-05, + "loss": 0.2003, + "step": 16641 + }, + { + "epoch": 0.266272, + "grad_norm": 0.93359375, + "learning_rate": 7.396612903225807e-05, + "loss": 0.2085, + "step": 16642 + }, + { + "epoch": 0.266288, + "grad_norm": 0.99609375, + "learning_rate": 7.396451612903225e-05, + "loss": 0.1729, + "step": 16643 + }, + { + "epoch": 0.266304, + "grad_norm": 0.64453125, + "learning_rate": 7.396290322580645e-05, + "loss": 0.1752, + "step": 16644 + }, + { + "epoch": 0.26632, + "grad_norm": 0.765625, + "learning_rate": 7.396129032258065e-05, + "loss": 0.1648, + "step": 16645 + }, + { + "epoch": 0.266336, + "grad_norm": 0.80078125, + "learning_rate": 7.395967741935485e-05, + "loss": 0.171, + "step": 16646 + }, + { + "epoch": 0.266352, + "grad_norm": 0.890625, + "learning_rate": 7.395806451612904e-05, + "loss": 0.1636, + "step": 16647 + }, + { + "epoch": 0.266368, + "grad_norm": 0.7421875, + "learning_rate": 7.395645161290324e-05, + "loss": 0.1464, + "step": 16648 + }, + { + "epoch": 0.266384, + "grad_norm": 0.7734375, + "learning_rate": 7.395483870967742e-05, + "loss": 0.1645, + "step": 16649 + }, + { + "epoch": 0.2664, + "grad_norm": 1.2109375, + "learning_rate": 7.395322580645162e-05, + "loss": 0.1984, + "step": 16650 + }, + { + "epoch": 0.266416, + "grad_norm": 0.7578125, + "learning_rate": 7.395161290322581e-05, + "loss": 0.1642, + "step": 16651 + }, + { + "epoch": 0.266432, + "grad_norm": 0.66015625, + "learning_rate": 7.395000000000001e-05, + "loss": 0.1526, + "step": 16652 + }, + { + "epoch": 0.266448, + "grad_norm": 1.0546875, + "learning_rate": 7.39483870967742e-05, + "loss": 0.167, + "step": 16653 + }, + { + "epoch": 0.266464, + "grad_norm": 0.80859375, + "learning_rate": 7.39467741935484e-05, + "loss": 0.1629, + "step": 16654 + }, + { + "epoch": 0.26648, + "grad_norm": 0.81640625, + "learning_rate": 7.394516129032258e-05, + "loss": 0.1452, + "step": 16655 + }, + { + "epoch": 0.266496, + "grad_norm": 0.8828125, + "learning_rate": 7.394354838709677e-05, + "loss": 0.1552, + "step": 16656 + }, + { + "epoch": 0.266512, + "grad_norm": 0.7109375, + "learning_rate": 7.394193548387097e-05, + "loss": 0.1823, + "step": 16657 + }, + { + "epoch": 0.266528, + "grad_norm": 0.84375, + "learning_rate": 7.394032258064517e-05, + "loss": 0.1735, + "step": 16658 + }, + { + "epoch": 0.266544, + "grad_norm": 0.71484375, + "learning_rate": 7.393870967741937e-05, + "loss": 0.1715, + "step": 16659 + }, + { + "epoch": 0.26656, + "grad_norm": 0.796875, + "learning_rate": 7.393709677419355e-05, + "loss": 0.1664, + "step": 16660 + }, + { + "epoch": 0.266576, + "grad_norm": 0.80859375, + "learning_rate": 7.393548387096775e-05, + "loss": 0.1428, + "step": 16661 + }, + { + "epoch": 0.266592, + "grad_norm": 0.8828125, + "learning_rate": 7.393387096774194e-05, + "loss": 0.1642, + "step": 16662 + }, + { + "epoch": 0.266608, + "grad_norm": 0.53515625, + "learning_rate": 7.393225806451614e-05, + "loss": 0.1644, + "step": 16663 + }, + { + "epoch": 0.266624, + "grad_norm": 0.73828125, + "learning_rate": 7.393064516129032e-05, + "loss": 0.1746, + "step": 16664 + }, + { + "epoch": 0.26664, + "grad_norm": 1.078125, + "learning_rate": 7.392903225806452e-05, + "loss": 0.1375, + "step": 16665 + }, + { + "epoch": 0.266656, + "grad_norm": 1.4921875, + "learning_rate": 7.392741935483871e-05, + "loss": 0.1756, + "step": 16666 + }, + { + "epoch": 0.266672, + "grad_norm": 1.0625, + "learning_rate": 7.392580645161291e-05, + "loss": 0.156, + "step": 16667 + }, + { + "epoch": 0.266688, + "grad_norm": 0.8515625, + "learning_rate": 7.39241935483871e-05, + "loss": 0.1614, + "step": 16668 + }, + { + "epoch": 0.266704, + "grad_norm": 1.171875, + "learning_rate": 7.39225806451613e-05, + "loss": 0.2095, + "step": 16669 + }, + { + "epoch": 0.26672, + "grad_norm": 0.92578125, + "learning_rate": 7.39209677419355e-05, + "loss": 0.1851, + "step": 16670 + }, + { + "epoch": 0.266736, + "grad_norm": 1.0703125, + "learning_rate": 7.391935483870968e-05, + "loss": 0.2052, + "step": 16671 + }, + { + "epoch": 0.266752, + "grad_norm": 0.640625, + "learning_rate": 7.391774193548388e-05, + "loss": 0.1958, + "step": 16672 + }, + { + "epoch": 0.266768, + "grad_norm": 0.58203125, + "learning_rate": 7.391612903225807e-05, + "loss": 0.1718, + "step": 16673 + }, + { + "epoch": 0.266784, + "grad_norm": 0.68359375, + "learning_rate": 7.391451612903226e-05, + "loss": 0.1743, + "step": 16674 + }, + { + "epoch": 0.2668, + "grad_norm": 0.6796875, + "learning_rate": 7.391290322580645e-05, + "loss": 0.153, + "step": 16675 + }, + { + "epoch": 0.266816, + "grad_norm": 0.6640625, + "learning_rate": 7.391129032258065e-05, + "loss": 0.1814, + "step": 16676 + }, + { + "epoch": 0.266832, + "grad_norm": 0.62109375, + "learning_rate": 7.390967741935484e-05, + "loss": 0.143, + "step": 16677 + }, + { + "epoch": 0.266848, + "grad_norm": 0.62109375, + "learning_rate": 7.390806451612904e-05, + "loss": 0.1725, + "step": 16678 + }, + { + "epoch": 0.266864, + "grad_norm": 0.9375, + "learning_rate": 7.390645161290322e-05, + "loss": 0.1528, + "step": 16679 + }, + { + "epoch": 0.26688, + "grad_norm": 0.51953125, + "learning_rate": 7.390483870967742e-05, + "loss": 0.1632, + "step": 16680 + }, + { + "epoch": 0.266896, + "grad_norm": 0.5625, + "learning_rate": 7.390322580645161e-05, + "loss": 0.1527, + "step": 16681 + }, + { + "epoch": 0.266912, + "grad_norm": 0.6171875, + "learning_rate": 7.390161290322581e-05, + "loss": 0.1583, + "step": 16682 + }, + { + "epoch": 0.266928, + "grad_norm": 1.0078125, + "learning_rate": 7.390000000000001e-05, + "loss": 0.2007, + "step": 16683 + }, + { + "epoch": 0.266944, + "grad_norm": 1.0234375, + "learning_rate": 7.389838709677421e-05, + "loss": 0.2112, + "step": 16684 + }, + { + "epoch": 0.26696, + "grad_norm": 0.828125, + "learning_rate": 7.389677419354839e-05, + "loss": 0.1373, + "step": 16685 + }, + { + "epoch": 0.266976, + "grad_norm": 0.6640625, + "learning_rate": 7.389516129032258e-05, + "loss": 0.1685, + "step": 16686 + }, + { + "epoch": 0.266992, + "grad_norm": 1.3203125, + "learning_rate": 7.389354838709678e-05, + "loss": 0.1908, + "step": 16687 + }, + { + "epoch": 0.267008, + "grad_norm": 1.0078125, + "learning_rate": 7.389193548387096e-05, + "loss": 0.1985, + "step": 16688 + }, + { + "epoch": 0.267024, + "grad_norm": 0.78125, + "learning_rate": 7.389032258064516e-05, + "loss": 0.1921, + "step": 16689 + }, + { + "epoch": 0.26704, + "grad_norm": 0.55078125, + "learning_rate": 7.388870967741935e-05, + "loss": 0.1317, + "step": 16690 + }, + { + "epoch": 0.267056, + "grad_norm": 0.68359375, + "learning_rate": 7.388709677419355e-05, + "loss": 0.1672, + "step": 16691 + }, + { + "epoch": 0.267072, + "grad_norm": 0.7890625, + "learning_rate": 7.388548387096774e-05, + "loss": 0.1763, + "step": 16692 + }, + { + "epoch": 0.267088, + "grad_norm": 0.58203125, + "learning_rate": 7.388387096774194e-05, + "loss": 0.1556, + "step": 16693 + }, + { + "epoch": 0.267104, + "grad_norm": 0.76953125, + "learning_rate": 7.388225806451614e-05, + "loss": 0.1478, + "step": 16694 + }, + { + "epoch": 0.26712, + "grad_norm": 0.69921875, + "learning_rate": 7.388064516129034e-05, + "loss": 0.1352, + "step": 16695 + }, + { + "epoch": 0.267136, + "grad_norm": 0.83984375, + "learning_rate": 7.387903225806452e-05, + "loss": 0.165, + "step": 16696 + }, + { + "epoch": 0.267152, + "grad_norm": 1.8125, + "learning_rate": 7.387741935483872e-05, + "loss": 0.1561, + "step": 16697 + }, + { + "epoch": 0.267168, + "grad_norm": 1.453125, + "learning_rate": 7.387580645161291e-05, + "loss": 0.1721, + "step": 16698 + }, + { + "epoch": 0.267184, + "grad_norm": 0.71484375, + "learning_rate": 7.38741935483871e-05, + "loss": 0.1607, + "step": 16699 + }, + { + "epoch": 0.2672, + "grad_norm": 0.921875, + "learning_rate": 7.387258064516129e-05, + "loss": 0.1434, + "step": 16700 + }, + { + "epoch": 0.267216, + "grad_norm": 0.578125, + "learning_rate": 7.387096774193549e-05, + "loss": 0.1559, + "step": 16701 + }, + { + "epoch": 0.267232, + "grad_norm": 0.9921875, + "learning_rate": 7.386935483870968e-05, + "loss": 0.1713, + "step": 16702 + }, + { + "epoch": 0.267248, + "grad_norm": 0.75, + "learning_rate": 7.386774193548386e-05, + "loss": 0.1412, + "step": 16703 + }, + { + "epoch": 0.267264, + "grad_norm": 0.71875, + "learning_rate": 7.386612903225806e-05, + "loss": 0.1829, + "step": 16704 + }, + { + "epoch": 0.26728, + "grad_norm": 0.73828125, + "learning_rate": 7.386451612903226e-05, + "loss": 0.2286, + "step": 16705 + }, + { + "epoch": 0.267296, + "grad_norm": 0.65234375, + "learning_rate": 7.386290322580646e-05, + "loss": 0.1283, + "step": 16706 + }, + { + "epoch": 0.267312, + "grad_norm": 1.078125, + "learning_rate": 7.386129032258065e-05, + "loss": 0.166, + "step": 16707 + }, + { + "epoch": 0.267328, + "grad_norm": 0.6171875, + "learning_rate": 7.385967741935485e-05, + "loss": 0.1711, + "step": 16708 + }, + { + "epoch": 0.267344, + "grad_norm": 1.40625, + "learning_rate": 7.385806451612904e-05, + "loss": 0.1653, + "step": 16709 + }, + { + "epoch": 0.26736, + "grad_norm": 0.7109375, + "learning_rate": 7.385645161290323e-05, + "loss": 0.1841, + "step": 16710 + }, + { + "epoch": 0.267376, + "grad_norm": 0.9140625, + "learning_rate": 7.385483870967742e-05, + "loss": 0.2216, + "step": 16711 + }, + { + "epoch": 0.267392, + "grad_norm": 0.59765625, + "learning_rate": 7.385322580645162e-05, + "loss": 0.1824, + "step": 16712 + }, + { + "epoch": 0.267408, + "grad_norm": 0.98046875, + "learning_rate": 7.38516129032258e-05, + "loss": 0.1783, + "step": 16713 + }, + { + "epoch": 0.267424, + "grad_norm": 0.63671875, + "learning_rate": 7.385e-05, + "loss": 0.1668, + "step": 16714 + }, + { + "epoch": 0.26744, + "grad_norm": 0.8046875, + "learning_rate": 7.384838709677419e-05, + "loss": 0.1691, + "step": 16715 + }, + { + "epoch": 0.267456, + "grad_norm": 0.8046875, + "learning_rate": 7.384677419354839e-05, + "loss": 0.1645, + "step": 16716 + }, + { + "epoch": 0.267472, + "grad_norm": 0.9609375, + "learning_rate": 7.384516129032258e-05, + "loss": 0.1695, + "step": 16717 + }, + { + "epoch": 0.267488, + "grad_norm": 0.5390625, + "learning_rate": 7.384354838709678e-05, + "loss": 0.1814, + "step": 16718 + }, + { + "epoch": 0.267504, + "grad_norm": 0.5859375, + "learning_rate": 7.384193548387098e-05, + "loss": 0.1763, + "step": 16719 + }, + { + "epoch": 0.26752, + "grad_norm": 0.70703125, + "learning_rate": 7.384032258064516e-05, + "loss": 0.1525, + "step": 16720 + }, + { + "epoch": 0.267536, + "grad_norm": 1.0859375, + "learning_rate": 7.383870967741936e-05, + "loss": 0.2387, + "step": 16721 + }, + { + "epoch": 0.267552, + "grad_norm": 1.4609375, + "learning_rate": 7.383709677419355e-05, + "loss": 0.1653, + "step": 16722 + }, + { + "epoch": 0.267568, + "grad_norm": 1.0078125, + "learning_rate": 7.383548387096775e-05, + "loss": 0.1587, + "step": 16723 + }, + { + "epoch": 0.267584, + "grad_norm": 0.640625, + "learning_rate": 7.383387096774193e-05, + "loss": 0.1818, + "step": 16724 + }, + { + "epoch": 0.2676, + "grad_norm": 1.0078125, + "learning_rate": 7.383225806451613e-05, + "loss": 0.1544, + "step": 16725 + }, + { + "epoch": 0.267616, + "grad_norm": 1.1484375, + "learning_rate": 7.383064516129032e-05, + "loss": 0.2225, + "step": 16726 + }, + { + "epoch": 0.267632, + "grad_norm": 0.75390625, + "learning_rate": 7.382903225806452e-05, + "loss": 0.1776, + "step": 16727 + }, + { + "epoch": 0.267648, + "grad_norm": 0.82421875, + "learning_rate": 7.38274193548387e-05, + "loss": 0.17, + "step": 16728 + }, + { + "epoch": 0.267664, + "grad_norm": 1.1484375, + "learning_rate": 7.38258064516129e-05, + "loss": 0.2132, + "step": 16729 + }, + { + "epoch": 0.26768, + "grad_norm": 0.859375, + "learning_rate": 7.38241935483871e-05, + "loss": 0.2105, + "step": 16730 + }, + { + "epoch": 0.267696, + "grad_norm": 0.69921875, + "learning_rate": 7.38225806451613e-05, + "loss": 0.2179, + "step": 16731 + }, + { + "epoch": 0.267712, + "grad_norm": 0.80078125, + "learning_rate": 7.382096774193549e-05, + "loss": 0.1312, + "step": 16732 + }, + { + "epoch": 0.267728, + "grad_norm": 0.6015625, + "learning_rate": 7.381935483870968e-05, + "loss": 0.1564, + "step": 16733 + }, + { + "epoch": 0.267744, + "grad_norm": 0.6171875, + "learning_rate": 7.381774193548388e-05, + "loss": 0.1687, + "step": 16734 + }, + { + "epoch": 0.26776, + "grad_norm": 0.6796875, + "learning_rate": 7.381612903225806e-05, + "loss": 0.1503, + "step": 16735 + }, + { + "epoch": 0.267776, + "grad_norm": 0.78515625, + "learning_rate": 7.381451612903226e-05, + "loss": 0.1913, + "step": 16736 + }, + { + "epoch": 0.267792, + "grad_norm": 1.109375, + "learning_rate": 7.381290322580645e-05, + "loss": 0.1362, + "step": 16737 + }, + { + "epoch": 0.267808, + "grad_norm": 0.68359375, + "learning_rate": 7.381129032258065e-05, + "loss": 0.1427, + "step": 16738 + }, + { + "epoch": 0.267824, + "grad_norm": 0.6015625, + "learning_rate": 7.380967741935483e-05, + "loss": 0.1505, + "step": 16739 + }, + { + "epoch": 0.26784, + "grad_norm": 0.76171875, + "learning_rate": 7.380806451612903e-05, + "loss": 0.1757, + "step": 16740 + }, + { + "epoch": 0.267856, + "grad_norm": 0.7109375, + "learning_rate": 7.380645161290323e-05, + "loss": 0.1616, + "step": 16741 + }, + { + "epoch": 0.267872, + "grad_norm": 0.609375, + "learning_rate": 7.380483870967742e-05, + "loss": 0.1511, + "step": 16742 + }, + { + "epoch": 0.267888, + "grad_norm": 1.46875, + "learning_rate": 7.380322580645162e-05, + "loss": 0.2475, + "step": 16743 + }, + { + "epoch": 0.267904, + "grad_norm": 0.5390625, + "learning_rate": 7.380161290322582e-05, + "loss": 0.1411, + "step": 16744 + }, + { + "epoch": 0.26792, + "grad_norm": 0.890625, + "learning_rate": 7.38e-05, + "loss": 0.1313, + "step": 16745 + }, + { + "epoch": 0.267936, + "grad_norm": 0.9453125, + "learning_rate": 7.37983870967742e-05, + "loss": 0.1587, + "step": 16746 + }, + { + "epoch": 0.267952, + "grad_norm": 0.65625, + "learning_rate": 7.379677419354839e-05, + "loss": 0.1599, + "step": 16747 + }, + { + "epoch": 0.267968, + "grad_norm": 1.0078125, + "learning_rate": 7.379516129032258e-05, + "loss": 0.187, + "step": 16748 + }, + { + "epoch": 0.267984, + "grad_norm": 1.0859375, + "learning_rate": 7.379354838709678e-05, + "loss": 0.163, + "step": 16749 + }, + { + "epoch": 0.268, + "grad_norm": 0.9609375, + "learning_rate": 7.379193548387096e-05, + "loss": 0.2081, + "step": 16750 + }, + { + "epoch": 0.268016, + "grad_norm": 1.2265625, + "learning_rate": 7.379032258064516e-05, + "loss": 0.1988, + "step": 16751 + }, + { + "epoch": 0.268032, + "grad_norm": 0.734375, + "learning_rate": 7.378870967741935e-05, + "loss": 0.163, + "step": 16752 + }, + { + "epoch": 0.268048, + "grad_norm": 0.5859375, + "learning_rate": 7.378709677419355e-05, + "loss": 0.1307, + "step": 16753 + }, + { + "epoch": 0.268064, + "grad_norm": 0.79296875, + "learning_rate": 7.378548387096775e-05, + "loss": 0.211, + "step": 16754 + }, + { + "epoch": 0.26808, + "grad_norm": 1.03125, + "learning_rate": 7.378387096774195e-05, + "loss": 0.2042, + "step": 16755 + }, + { + "epoch": 0.268096, + "grad_norm": 0.54296875, + "learning_rate": 7.378225806451613e-05, + "loss": 0.1686, + "step": 16756 + }, + { + "epoch": 0.268112, + "grad_norm": 0.76171875, + "learning_rate": 7.378064516129033e-05, + "loss": 0.1737, + "step": 16757 + }, + { + "epoch": 0.268128, + "grad_norm": 1.1171875, + "learning_rate": 7.377903225806452e-05, + "loss": 0.1575, + "step": 16758 + }, + { + "epoch": 0.268144, + "grad_norm": 0.77734375, + "learning_rate": 7.377741935483872e-05, + "loss": 0.193, + "step": 16759 + }, + { + "epoch": 0.26816, + "grad_norm": 1.6953125, + "learning_rate": 7.37758064516129e-05, + "loss": 0.1613, + "step": 16760 + }, + { + "epoch": 0.268176, + "grad_norm": 1.28125, + "learning_rate": 7.37741935483871e-05, + "loss": 0.1784, + "step": 16761 + }, + { + "epoch": 0.268192, + "grad_norm": 0.625, + "learning_rate": 7.377258064516129e-05, + "loss": 0.1558, + "step": 16762 + }, + { + "epoch": 0.268208, + "grad_norm": 0.625, + "learning_rate": 7.377096774193549e-05, + "loss": 0.1629, + "step": 16763 + }, + { + "epoch": 0.268224, + "grad_norm": 0.7265625, + "learning_rate": 7.376935483870968e-05, + "loss": 0.1682, + "step": 16764 + }, + { + "epoch": 0.26824, + "grad_norm": 0.90625, + "learning_rate": 7.376774193548388e-05, + "loss": 0.1808, + "step": 16765 + }, + { + "epoch": 0.268256, + "grad_norm": 0.9375, + "learning_rate": 7.376612903225808e-05, + "loss": 0.1449, + "step": 16766 + }, + { + "epoch": 0.268272, + "grad_norm": 1.28125, + "learning_rate": 7.376451612903226e-05, + "loss": 0.1854, + "step": 16767 + }, + { + "epoch": 0.268288, + "grad_norm": 0.65625, + "learning_rate": 7.376290322580646e-05, + "loss": 0.1544, + "step": 16768 + }, + { + "epoch": 0.268304, + "grad_norm": 1.0859375, + "learning_rate": 7.376129032258065e-05, + "loss": 0.1744, + "step": 16769 + }, + { + "epoch": 0.26832, + "grad_norm": 0.70703125, + "learning_rate": 7.375967741935485e-05, + "loss": 0.1904, + "step": 16770 + }, + { + "epoch": 0.268336, + "grad_norm": 0.859375, + "learning_rate": 7.375806451612903e-05, + "loss": 0.2022, + "step": 16771 + }, + { + "epoch": 0.268352, + "grad_norm": 0.91796875, + "learning_rate": 7.375645161290323e-05, + "loss": 0.1301, + "step": 16772 + }, + { + "epoch": 0.268368, + "grad_norm": 1.109375, + "learning_rate": 7.375483870967742e-05, + "loss": 0.1583, + "step": 16773 + }, + { + "epoch": 0.268384, + "grad_norm": 1.03125, + "learning_rate": 7.375322580645162e-05, + "loss": 0.148, + "step": 16774 + }, + { + "epoch": 0.2684, + "grad_norm": 0.625, + "learning_rate": 7.37516129032258e-05, + "loss": 0.1312, + "step": 16775 + }, + { + "epoch": 0.268416, + "grad_norm": 0.7421875, + "learning_rate": 7.375e-05, + "loss": 0.2286, + "step": 16776 + }, + { + "epoch": 0.268432, + "grad_norm": 0.59765625, + "learning_rate": 7.374838709677419e-05, + "loss": 0.1418, + "step": 16777 + }, + { + "epoch": 0.268448, + "grad_norm": 0.76171875, + "learning_rate": 7.374677419354839e-05, + "loss": 0.1816, + "step": 16778 + }, + { + "epoch": 0.268464, + "grad_norm": 0.87890625, + "learning_rate": 7.374516129032259e-05, + "loss": 0.1935, + "step": 16779 + }, + { + "epoch": 0.26848, + "grad_norm": 1.09375, + "learning_rate": 7.374354838709678e-05, + "loss": 0.1534, + "step": 16780 + }, + { + "epoch": 0.268496, + "grad_norm": 0.93359375, + "learning_rate": 7.374193548387097e-05, + "loss": 0.1627, + "step": 16781 + }, + { + "epoch": 0.268512, + "grad_norm": 0.84765625, + "learning_rate": 7.374032258064516e-05, + "loss": 0.1597, + "step": 16782 + }, + { + "epoch": 0.268528, + "grad_norm": 0.71484375, + "learning_rate": 7.373870967741936e-05, + "loss": 0.1607, + "step": 16783 + }, + { + "epoch": 0.268544, + "grad_norm": 0.9453125, + "learning_rate": 7.373709677419355e-05, + "loss": 0.1764, + "step": 16784 + }, + { + "epoch": 0.26856, + "grad_norm": 0.9609375, + "learning_rate": 7.373548387096775e-05, + "loss": 0.22, + "step": 16785 + }, + { + "epoch": 0.268576, + "grad_norm": 0.703125, + "learning_rate": 7.373387096774193e-05, + "loss": 0.133, + "step": 16786 + }, + { + "epoch": 0.268592, + "grad_norm": 0.80859375, + "learning_rate": 7.373225806451613e-05, + "loss": 0.1659, + "step": 16787 + }, + { + "epoch": 0.268608, + "grad_norm": 1.0703125, + "learning_rate": 7.373064516129032e-05, + "loss": 0.1742, + "step": 16788 + }, + { + "epoch": 0.268624, + "grad_norm": 0.7734375, + "learning_rate": 7.372903225806452e-05, + "loss": 0.1583, + "step": 16789 + }, + { + "epoch": 0.26864, + "grad_norm": 0.63671875, + "learning_rate": 7.372741935483872e-05, + "loss": 0.1736, + "step": 16790 + }, + { + "epoch": 0.268656, + "grad_norm": 0.890625, + "learning_rate": 7.372580645161292e-05, + "loss": 0.1872, + "step": 16791 + }, + { + "epoch": 0.268672, + "grad_norm": 0.58984375, + "learning_rate": 7.37241935483871e-05, + "loss": 0.157, + "step": 16792 + }, + { + "epoch": 0.268688, + "grad_norm": 0.72265625, + "learning_rate": 7.37225806451613e-05, + "loss": 0.1815, + "step": 16793 + }, + { + "epoch": 0.268704, + "grad_norm": 1.484375, + "learning_rate": 7.372096774193549e-05, + "loss": 0.1627, + "step": 16794 + }, + { + "epoch": 0.26872, + "grad_norm": 1.0859375, + "learning_rate": 7.371935483870967e-05, + "loss": 0.2008, + "step": 16795 + }, + { + "epoch": 0.268736, + "grad_norm": 0.71875, + "learning_rate": 7.371774193548387e-05, + "loss": 0.194, + "step": 16796 + }, + { + "epoch": 0.268752, + "grad_norm": 0.92578125, + "learning_rate": 7.371612903225806e-05, + "loss": 0.1708, + "step": 16797 + }, + { + "epoch": 0.268768, + "grad_norm": 0.87890625, + "learning_rate": 7.371451612903226e-05, + "loss": 0.1476, + "step": 16798 + }, + { + "epoch": 0.268784, + "grad_norm": 0.62890625, + "learning_rate": 7.371290322580645e-05, + "loss": 0.1563, + "step": 16799 + }, + { + "epoch": 0.2688, + "grad_norm": 0.91796875, + "learning_rate": 7.371129032258065e-05, + "loss": 0.1921, + "step": 16800 + }, + { + "epoch": 0.268816, + "grad_norm": 0.98828125, + "learning_rate": 7.370967741935485e-05, + "loss": 0.1832, + "step": 16801 + }, + { + "epoch": 0.268832, + "grad_norm": 0.63671875, + "learning_rate": 7.370806451612904e-05, + "loss": 0.1465, + "step": 16802 + }, + { + "epoch": 0.268848, + "grad_norm": 0.9140625, + "learning_rate": 7.370645161290323e-05, + "loss": 0.1609, + "step": 16803 + }, + { + "epoch": 0.268864, + "grad_norm": 0.875, + "learning_rate": 7.370483870967743e-05, + "loss": 0.1545, + "step": 16804 + }, + { + "epoch": 0.26888, + "grad_norm": 0.90234375, + "learning_rate": 7.370322580645162e-05, + "loss": 0.217, + "step": 16805 + }, + { + "epoch": 0.268896, + "grad_norm": 0.64453125, + "learning_rate": 7.370161290322582e-05, + "loss": 0.1628, + "step": 16806 + }, + { + "epoch": 0.268912, + "grad_norm": 0.60546875, + "learning_rate": 7.37e-05, + "loss": 0.1225, + "step": 16807 + }, + { + "epoch": 0.268928, + "grad_norm": 1.25, + "learning_rate": 7.36983870967742e-05, + "loss": 0.1702, + "step": 16808 + }, + { + "epoch": 0.268944, + "grad_norm": 0.58984375, + "learning_rate": 7.369677419354839e-05, + "loss": 0.1681, + "step": 16809 + }, + { + "epoch": 0.26896, + "grad_norm": 0.6640625, + "learning_rate": 7.369516129032259e-05, + "loss": 0.165, + "step": 16810 + }, + { + "epoch": 0.268976, + "grad_norm": 1.09375, + "learning_rate": 7.369354838709677e-05, + "loss": 0.1554, + "step": 16811 + }, + { + "epoch": 0.268992, + "grad_norm": 0.474609375, + "learning_rate": 7.369193548387096e-05, + "loss": 0.1533, + "step": 16812 + }, + { + "epoch": 0.269008, + "grad_norm": 0.69921875, + "learning_rate": 7.369032258064516e-05, + "loss": 0.1678, + "step": 16813 + }, + { + "epoch": 0.269024, + "grad_norm": 0.93359375, + "learning_rate": 7.368870967741936e-05, + "loss": 0.2108, + "step": 16814 + }, + { + "epoch": 0.26904, + "grad_norm": 0.50390625, + "learning_rate": 7.368709677419356e-05, + "loss": 0.1233, + "step": 16815 + }, + { + "epoch": 0.269056, + "grad_norm": 0.84765625, + "learning_rate": 7.368548387096774e-05, + "loss": 0.1609, + "step": 16816 + }, + { + "epoch": 0.269072, + "grad_norm": 0.80859375, + "learning_rate": 7.368387096774194e-05, + "loss": 0.1887, + "step": 16817 + }, + { + "epoch": 0.269088, + "grad_norm": 1.1953125, + "learning_rate": 7.368225806451613e-05, + "loss": 0.2022, + "step": 16818 + }, + { + "epoch": 0.269104, + "grad_norm": 0.640625, + "learning_rate": 7.368064516129033e-05, + "loss": 0.1808, + "step": 16819 + }, + { + "epoch": 0.26912, + "grad_norm": 0.65625, + "learning_rate": 7.367903225806452e-05, + "loss": 0.1927, + "step": 16820 + }, + { + "epoch": 0.269136, + "grad_norm": 0.86328125, + "learning_rate": 7.367741935483872e-05, + "loss": 0.1484, + "step": 16821 + }, + { + "epoch": 0.269152, + "grad_norm": 0.80859375, + "learning_rate": 7.36758064516129e-05, + "loss": 0.16, + "step": 16822 + }, + { + "epoch": 0.269168, + "grad_norm": 0.71875, + "learning_rate": 7.36741935483871e-05, + "loss": 0.1606, + "step": 16823 + }, + { + "epoch": 0.269184, + "grad_norm": 1.3125, + "learning_rate": 7.367258064516129e-05, + "loss": 0.1489, + "step": 16824 + }, + { + "epoch": 0.2692, + "grad_norm": 1.1171875, + "learning_rate": 7.367096774193549e-05, + "loss": 0.1631, + "step": 16825 + }, + { + "epoch": 0.269216, + "grad_norm": 1.34375, + "learning_rate": 7.366935483870969e-05, + "loss": 0.1632, + "step": 16826 + }, + { + "epoch": 0.269232, + "grad_norm": 0.8984375, + "learning_rate": 7.366774193548387e-05, + "loss": 0.1713, + "step": 16827 + }, + { + "epoch": 0.269248, + "grad_norm": 0.72265625, + "learning_rate": 7.366612903225807e-05, + "loss": 0.1581, + "step": 16828 + }, + { + "epoch": 0.269264, + "grad_norm": 0.6796875, + "learning_rate": 7.366451612903226e-05, + "loss": 0.1966, + "step": 16829 + }, + { + "epoch": 0.26928, + "grad_norm": 0.875, + "learning_rate": 7.366290322580646e-05, + "loss": 0.1712, + "step": 16830 + }, + { + "epoch": 0.269296, + "grad_norm": 0.5859375, + "learning_rate": 7.366129032258064e-05, + "loss": 0.1708, + "step": 16831 + }, + { + "epoch": 0.269312, + "grad_norm": 0.8125, + "learning_rate": 7.365967741935484e-05, + "loss": 0.1822, + "step": 16832 + }, + { + "epoch": 0.269328, + "grad_norm": 0.890625, + "learning_rate": 7.365806451612903e-05, + "loss": 0.1871, + "step": 16833 + }, + { + "epoch": 0.269344, + "grad_norm": 1.34375, + "learning_rate": 7.365645161290323e-05, + "loss": 0.1854, + "step": 16834 + }, + { + "epoch": 0.26936, + "grad_norm": 0.75, + "learning_rate": 7.365483870967742e-05, + "loss": 0.1356, + "step": 16835 + }, + { + "epoch": 0.269376, + "grad_norm": 0.78515625, + "learning_rate": 7.365322580645162e-05, + "loss": 0.1544, + "step": 16836 + }, + { + "epoch": 0.269392, + "grad_norm": 0.6015625, + "learning_rate": 7.365161290322582e-05, + "loss": 0.1738, + "step": 16837 + }, + { + "epoch": 0.269408, + "grad_norm": 0.75, + "learning_rate": 7.365e-05, + "loss": 0.1966, + "step": 16838 + }, + { + "epoch": 0.269424, + "grad_norm": 0.80859375, + "learning_rate": 7.36483870967742e-05, + "loss": 0.1869, + "step": 16839 + }, + { + "epoch": 0.26944, + "grad_norm": 0.90625, + "learning_rate": 7.36467741935484e-05, + "loss": 0.1883, + "step": 16840 + }, + { + "epoch": 0.269456, + "grad_norm": 1.03125, + "learning_rate": 7.364516129032259e-05, + "loss": 0.1303, + "step": 16841 + }, + { + "epoch": 0.269472, + "grad_norm": 0.7734375, + "learning_rate": 7.364354838709677e-05, + "loss": 0.194, + "step": 16842 + }, + { + "epoch": 0.269488, + "grad_norm": 0.7421875, + "learning_rate": 7.364193548387097e-05, + "loss": 0.1861, + "step": 16843 + }, + { + "epoch": 0.269504, + "grad_norm": 0.88671875, + "learning_rate": 7.364032258064516e-05, + "loss": 0.1694, + "step": 16844 + }, + { + "epoch": 0.26952, + "grad_norm": 1.1328125, + "learning_rate": 7.363870967741936e-05, + "loss": 0.1624, + "step": 16845 + }, + { + "epoch": 0.269536, + "grad_norm": 0.90625, + "learning_rate": 7.363709677419354e-05, + "loss": 0.1784, + "step": 16846 + }, + { + "epoch": 0.269552, + "grad_norm": 1.2265625, + "learning_rate": 7.363548387096774e-05, + "loss": 0.1231, + "step": 16847 + }, + { + "epoch": 0.269568, + "grad_norm": 0.54296875, + "learning_rate": 7.363387096774193e-05, + "loss": 0.1393, + "step": 16848 + }, + { + "epoch": 0.269584, + "grad_norm": 0.703125, + "learning_rate": 7.363225806451613e-05, + "loss": 0.2085, + "step": 16849 + }, + { + "epoch": 0.2696, + "grad_norm": 0.69140625, + "learning_rate": 7.363064516129033e-05, + "loss": 0.1703, + "step": 16850 + }, + { + "epoch": 0.269616, + "grad_norm": 0.83203125, + "learning_rate": 7.362903225806453e-05, + "loss": 0.1744, + "step": 16851 + }, + { + "epoch": 0.269632, + "grad_norm": 0.91015625, + "learning_rate": 7.362741935483871e-05, + "loss": 0.2112, + "step": 16852 + }, + { + "epoch": 0.269648, + "grad_norm": 0.578125, + "learning_rate": 7.362580645161291e-05, + "loss": 0.1762, + "step": 16853 + }, + { + "epoch": 0.269664, + "grad_norm": 0.54296875, + "learning_rate": 7.36241935483871e-05, + "loss": 0.1659, + "step": 16854 + }, + { + "epoch": 0.26968, + "grad_norm": 0.8515625, + "learning_rate": 7.36225806451613e-05, + "loss": 0.152, + "step": 16855 + }, + { + "epoch": 0.269696, + "grad_norm": 0.6484375, + "learning_rate": 7.362096774193549e-05, + "loss": 0.1735, + "step": 16856 + }, + { + "epoch": 0.269712, + "grad_norm": 0.5390625, + "learning_rate": 7.361935483870967e-05, + "loss": 0.1718, + "step": 16857 + }, + { + "epoch": 0.269728, + "grad_norm": 1.1171875, + "learning_rate": 7.361774193548387e-05, + "loss": 0.2436, + "step": 16858 + }, + { + "epoch": 0.269744, + "grad_norm": 0.89453125, + "learning_rate": 7.361612903225806e-05, + "loss": 0.165, + "step": 16859 + }, + { + "epoch": 0.26976, + "grad_norm": 0.68359375, + "learning_rate": 7.361451612903226e-05, + "loss": 0.1909, + "step": 16860 + }, + { + "epoch": 0.269776, + "grad_norm": 1.0234375, + "learning_rate": 7.361290322580646e-05, + "loss": 0.1849, + "step": 16861 + }, + { + "epoch": 0.269792, + "grad_norm": 1.09375, + "learning_rate": 7.361129032258066e-05, + "loss": 0.198, + "step": 16862 + }, + { + "epoch": 0.269808, + "grad_norm": 0.6796875, + "learning_rate": 7.360967741935484e-05, + "loss": 0.1576, + "step": 16863 + }, + { + "epoch": 0.269824, + "grad_norm": 0.7578125, + "learning_rate": 7.360806451612904e-05, + "loss": 0.1516, + "step": 16864 + }, + { + "epoch": 0.26984, + "grad_norm": 0.68359375, + "learning_rate": 7.360645161290323e-05, + "loss": 0.2225, + "step": 16865 + }, + { + "epoch": 0.269856, + "grad_norm": 0.58203125, + "learning_rate": 7.360483870967743e-05, + "loss": 0.1431, + "step": 16866 + }, + { + "epoch": 0.269872, + "grad_norm": 0.56640625, + "learning_rate": 7.360322580645161e-05, + "loss": 0.1861, + "step": 16867 + }, + { + "epoch": 0.269888, + "grad_norm": 1.1484375, + "learning_rate": 7.360161290322581e-05, + "loss": 0.1659, + "step": 16868 + }, + { + "epoch": 0.269904, + "grad_norm": 0.87890625, + "learning_rate": 7.36e-05, + "loss": 0.188, + "step": 16869 + }, + { + "epoch": 0.26992, + "grad_norm": 1.4765625, + "learning_rate": 7.35983870967742e-05, + "loss": 0.1578, + "step": 16870 + }, + { + "epoch": 0.269936, + "grad_norm": 0.6875, + "learning_rate": 7.359677419354839e-05, + "loss": 0.1541, + "step": 16871 + }, + { + "epoch": 0.269952, + "grad_norm": 0.6875, + "learning_rate": 7.359516129032259e-05, + "loss": 0.1625, + "step": 16872 + }, + { + "epoch": 0.269968, + "grad_norm": 1.109375, + "learning_rate": 7.359354838709677e-05, + "loss": 0.1921, + "step": 16873 + }, + { + "epoch": 0.269984, + "grad_norm": 0.81640625, + "learning_rate": 7.359193548387097e-05, + "loss": 0.1592, + "step": 16874 + }, + { + "epoch": 0.27, + "grad_norm": 0.82421875, + "learning_rate": 7.359032258064517e-05, + "loss": 0.2101, + "step": 16875 + }, + { + "epoch": 0.270016, + "grad_norm": 0.65625, + "learning_rate": 7.358870967741936e-05, + "loss": 0.1418, + "step": 16876 + }, + { + "epoch": 0.270032, + "grad_norm": 0.9375, + "learning_rate": 7.358709677419356e-05, + "loss": 0.192, + "step": 16877 + }, + { + "epoch": 0.270048, + "grad_norm": 1.171875, + "learning_rate": 7.358548387096774e-05, + "loss": 0.2179, + "step": 16878 + }, + { + "epoch": 0.270064, + "grad_norm": 1.296875, + "learning_rate": 7.358387096774194e-05, + "loss": 0.1922, + "step": 16879 + }, + { + "epoch": 0.27008, + "grad_norm": 1.015625, + "learning_rate": 7.358225806451613e-05, + "loss": 0.1409, + "step": 16880 + }, + { + "epoch": 0.270096, + "grad_norm": 0.65234375, + "learning_rate": 7.358064516129033e-05, + "loss": 0.1607, + "step": 16881 + }, + { + "epoch": 0.270112, + "grad_norm": 0.50390625, + "learning_rate": 7.357903225806451e-05, + "loss": 0.1512, + "step": 16882 + }, + { + "epoch": 0.270128, + "grad_norm": 0.66796875, + "learning_rate": 7.357741935483871e-05, + "loss": 0.1717, + "step": 16883 + }, + { + "epoch": 0.270144, + "grad_norm": 0.8515625, + "learning_rate": 7.35758064516129e-05, + "loss": 0.1528, + "step": 16884 + }, + { + "epoch": 0.27016, + "grad_norm": 1.1796875, + "learning_rate": 7.35741935483871e-05, + "loss": 0.1558, + "step": 16885 + }, + { + "epoch": 0.270176, + "grad_norm": 1.515625, + "learning_rate": 7.35725806451613e-05, + "loss": 0.2197, + "step": 16886 + }, + { + "epoch": 0.270192, + "grad_norm": 1.671875, + "learning_rate": 7.35709677419355e-05, + "loss": 0.156, + "step": 16887 + }, + { + "epoch": 0.270208, + "grad_norm": 0.69140625, + "learning_rate": 7.356935483870968e-05, + "loss": 0.1592, + "step": 16888 + }, + { + "epoch": 0.270224, + "grad_norm": 1.015625, + "learning_rate": 7.356774193548387e-05, + "loss": 0.1403, + "step": 16889 + }, + { + "epoch": 0.27024, + "grad_norm": 0.765625, + "learning_rate": 7.356612903225807e-05, + "loss": 0.1736, + "step": 16890 + }, + { + "epoch": 0.270256, + "grad_norm": 0.6953125, + "learning_rate": 7.356451612903226e-05, + "loss": 0.1668, + "step": 16891 + }, + { + "epoch": 0.270272, + "grad_norm": 0.9140625, + "learning_rate": 7.356290322580646e-05, + "loss": 0.1827, + "step": 16892 + }, + { + "epoch": 0.270288, + "grad_norm": 0.6875, + "learning_rate": 7.356129032258064e-05, + "loss": 0.1854, + "step": 16893 + }, + { + "epoch": 0.270304, + "grad_norm": 0.64453125, + "learning_rate": 7.355967741935484e-05, + "loss": 0.1845, + "step": 16894 + }, + { + "epoch": 0.27032, + "grad_norm": 1.2578125, + "learning_rate": 7.355806451612903e-05, + "loss": 0.1937, + "step": 16895 + }, + { + "epoch": 0.270336, + "grad_norm": 1.15625, + "learning_rate": 7.355645161290323e-05, + "loss": 0.1714, + "step": 16896 + }, + { + "epoch": 0.270352, + "grad_norm": 0.97265625, + "learning_rate": 7.355483870967743e-05, + "loss": 0.2303, + "step": 16897 + }, + { + "epoch": 0.270368, + "grad_norm": 1.1484375, + "learning_rate": 7.355322580645163e-05, + "loss": 0.1827, + "step": 16898 + }, + { + "epoch": 0.270384, + "grad_norm": 1.0703125, + "learning_rate": 7.355161290322581e-05, + "loss": 0.1721, + "step": 16899 + }, + { + "epoch": 0.2704, + "grad_norm": 0.70703125, + "learning_rate": 7.355000000000001e-05, + "loss": 0.1746, + "step": 16900 + }, + { + "epoch": 0.270416, + "grad_norm": 1.0625, + "learning_rate": 7.35483870967742e-05, + "loss": 0.2326, + "step": 16901 + }, + { + "epoch": 0.270432, + "grad_norm": 1.0234375, + "learning_rate": 7.35467741935484e-05, + "loss": 0.172, + "step": 16902 + }, + { + "epoch": 0.270448, + "grad_norm": 0.65625, + "learning_rate": 7.354516129032258e-05, + "loss": 0.1767, + "step": 16903 + }, + { + "epoch": 0.270464, + "grad_norm": 0.7265625, + "learning_rate": 7.354354838709677e-05, + "loss": 0.1634, + "step": 16904 + }, + { + "epoch": 0.27048, + "grad_norm": 0.703125, + "learning_rate": 7.354193548387097e-05, + "loss": 0.1712, + "step": 16905 + }, + { + "epoch": 0.270496, + "grad_norm": 1.4921875, + "learning_rate": 7.354032258064516e-05, + "loss": 0.1785, + "step": 16906 + }, + { + "epoch": 0.270512, + "grad_norm": 1.453125, + "learning_rate": 7.353870967741936e-05, + "loss": 0.1599, + "step": 16907 + }, + { + "epoch": 0.270528, + "grad_norm": 0.68359375, + "learning_rate": 7.353709677419354e-05, + "loss": 0.1474, + "step": 16908 + }, + { + "epoch": 0.270544, + "grad_norm": 0.734375, + "learning_rate": 7.353548387096774e-05, + "loss": 0.1465, + "step": 16909 + }, + { + "epoch": 0.27056, + "grad_norm": 0.8125, + "learning_rate": 7.353387096774194e-05, + "loss": 0.1773, + "step": 16910 + }, + { + "epoch": 0.270576, + "grad_norm": 0.671875, + "learning_rate": 7.353225806451614e-05, + "loss": 0.1341, + "step": 16911 + }, + { + "epoch": 0.270592, + "grad_norm": 0.5703125, + "learning_rate": 7.353064516129033e-05, + "loss": 0.1738, + "step": 16912 + }, + { + "epoch": 0.270608, + "grad_norm": 0.7578125, + "learning_rate": 7.352903225806453e-05, + "loss": 0.1458, + "step": 16913 + }, + { + "epoch": 0.270624, + "grad_norm": 0.99609375, + "learning_rate": 7.352741935483871e-05, + "loss": 0.2418, + "step": 16914 + }, + { + "epoch": 0.27064, + "grad_norm": 0.94921875, + "learning_rate": 7.352580645161291e-05, + "loss": 0.1633, + "step": 16915 + }, + { + "epoch": 0.270656, + "grad_norm": 0.7265625, + "learning_rate": 7.35241935483871e-05, + "loss": 0.1528, + "step": 16916 + }, + { + "epoch": 0.270672, + "grad_norm": 1.078125, + "learning_rate": 7.35225806451613e-05, + "loss": 0.2086, + "step": 16917 + }, + { + "epoch": 0.270688, + "grad_norm": 0.8359375, + "learning_rate": 7.352096774193548e-05, + "loss": 0.1675, + "step": 16918 + }, + { + "epoch": 0.270704, + "grad_norm": 1.6484375, + "learning_rate": 7.351935483870968e-05, + "loss": 0.1659, + "step": 16919 + }, + { + "epoch": 0.27072, + "grad_norm": 1.7109375, + "learning_rate": 7.351774193548387e-05, + "loss": 0.1639, + "step": 16920 + }, + { + "epoch": 0.270736, + "grad_norm": 0.49609375, + "learning_rate": 7.351612903225807e-05, + "loss": 0.1619, + "step": 16921 + }, + { + "epoch": 0.270752, + "grad_norm": 1.8828125, + "learning_rate": 7.351451612903227e-05, + "loss": 0.1669, + "step": 16922 + }, + { + "epoch": 0.270768, + "grad_norm": 1.0703125, + "learning_rate": 7.351290322580645e-05, + "loss": 0.1818, + "step": 16923 + }, + { + "epoch": 0.270784, + "grad_norm": 0.64453125, + "learning_rate": 7.351129032258065e-05, + "loss": 0.1535, + "step": 16924 + }, + { + "epoch": 0.2708, + "grad_norm": 0.92578125, + "learning_rate": 7.350967741935484e-05, + "loss": 0.1746, + "step": 16925 + }, + { + "epoch": 0.270816, + "grad_norm": 1.0859375, + "learning_rate": 7.350806451612904e-05, + "loss": 0.1962, + "step": 16926 + }, + { + "epoch": 0.270832, + "grad_norm": 0.87890625, + "learning_rate": 7.350645161290323e-05, + "loss": 0.1532, + "step": 16927 + }, + { + "epoch": 0.270848, + "grad_norm": 0.74609375, + "learning_rate": 7.350483870967743e-05, + "loss": 0.157, + "step": 16928 + }, + { + "epoch": 0.270864, + "grad_norm": 0.87109375, + "learning_rate": 7.350322580645161e-05, + "loss": 0.1721, + "step": 16929 + }, + { + "epoch": 0.27088, + "grad_norm": 0.734375, + "learning_rate": 7.350161290322581e-05, + "loss": 0.1632, + "step": 16930 + }, + { + "epoch": 0.270896, + "grad_norm": 0.5546875, + "learning_rate": 7.35e-05, + "loss": 0.1668, + "step": 16931 + }, + { + "epoch": 0.270912, + "grad_norm": 1.6875, + "learning_rate": 7.34983870967742e-05, + "loss": 0.1711, + "step": 16932 + }, + { + "epoch": 0.270928, + "grad_norm": 0.91015625, + "learning_rate": 7.349677419354838e-05, + "loss": 0.1468, + "step": 16933 + }, + { + "epoch": 0.270944, + "grad_norm": 0.8671875, + "learning_rate": 7.349516129032258e-05, + "loss": 0.2347, + "step": 16934 + }, + { + "epoch": 0.27096, + "grad_norm": 0.59765625, + "learning_rate": 7.349354838709678e-05, + "loss": 0.1473, + "step": 16935 + }, + { + "epoch": 0.270976, + "grad_norm": 0.81640625, + "learning_rate": 7.349193548387097e-05, + "loss": 0.1205, + "step": 16936 + }, + { + "epoch": 0.270992, + "grad_norm": 1.53125, + "learning_rate": 7.349032258064517e-05, + "loss": 0.1779, + "step": 16937 + }, + { + "epoch": 0.271008, + "grad_norm": 0.8125, + "learning_rate": 7.348870967741935e-05, + "loss": 0.1577, + "step": 16938 + }, + { + "epoch": 0.271024, + "grad_norm": 0.734375, + "learning_rate": 7.348709677419355e-05, + "loss": 0.158, + "step": 16939 + }, + { + "epoch": 0.27104, + "grad_norm": 0.68359375, + "learning_rate": 7.348548387096774e-05, + "loss": 0.1904, + "step": 16940 + }, + { + "epoch": 0.271056, + "grad_norm": 0.5546875, + "learning_rate": 7.348387096774194e-05, + "loss": 0.1479, + "step": 16941 + }, + { + "epoch": 0.271072, + "grad_norm": 0.78125, + "learning_rate": 7.348225806451613e-05, + "loss": 0.1942, + "step": 16942 + }, + { + "epoch": 0.271088, + "grad_norm": 1.0546875, + "learning_rate": 7.348064516129033e-05, + "loss": 0.1613, + "step": 16943 + }, + { + "epoch": 0.271104, + "grad_norm": 0.64453125, + "learning_rate": 7.347903225806451e-05, + "loss": 0.1825, + "step": 16944 + }, + { + "epoch": 0.27112, + "grad_norm": 0.6796875, + "learning_rate": 7.347741935483871e-05, + "loss": 0.1837, + "step": 16945 + }, + { + "epoch": 0.271136, + "grad_norm": 0.890625, + "learning_rate": 7.347580645161291e-05, + "loss": 0.1726, + "step": 16946 + }, + { + "epoch": 0.271152, + "grad_norm": 0.86328125, + "learning_rate": 7.347419354838711e-05, + "loss": 0.1714, + "step": 16947 + }, + { + "epoch": 0.271168, + "grad_norm": 0.7265625, + "learning_rate": 7.34725806451613e-05, + "loss": 0.1692, + "step": 16948 + }, + { + "epoch": 0.271184, + "grad_norm": 0.54296875, + "learning_rate": 7.34709677419355e-05, + "loss": 0.1678, + "step": 16949 + }, + { + "epoch": 0.2712, + "grad_norm": 0.7265625, + "learning_rate": 7.346935483870968e-05, + "loss": 0.1809, + "step": 16950 + }, + { + "epoch": 0.271216, + "grad_norm": 0.74609375, + "learning_rate": 7.346774193548387e-05, + "loss": 0.1805, + "step": 16951 + }, + { + "epoch": 0.271232, + "grad_norm": 1.0859375, + "learning_rate": 7.346612903225807e-05, + "loss": 0.2147, + "step": 16952 + }, + { + "epoch": 0.271248, + "grad_norm": 0.69921875, + "learning_rate": 7.346451612903225e-05, + "loss": 0.1568, + "step": 16953 + }, + { + "epoch": 0.271264, + "grad_norm": 0.953125, + "learning_rate": 7.346290322580645e-05, + "loss": 0.2065, + "step": 16954 + }, + { + "epoch": 0.27128, + "grad_norm": 0.75, + "learning_rate": 7.346129032258064e-05, + "loss": 0.1628, + "step": 16955 + }, + { + "epoch": 0.271296, + "grad_norm": 0.80859375, + "learning_rate": 7.345967741935484e-05, + "loss": 0.1914, + "step": 16956 + }, + { + "epoch": 0.271312, + "grad_norm": 0.74609375, + "learning_rate": 7.345806451612904e-05, + "loss": 0.1829, + "step": 16957 + }, + { + "epoch": 0.271328, + "grad_norm": 0.84765625, + "learning_rate": 7.345645161290324e-05, + "loss": 0.213, + "step": 16958 + }, + { + "epoch": 0.271344, + "grad_norm": 1.0078125, + "learning_rate": 7.345483870967742e-05, + "loss": 0.1758, + "step": 16959 + }, + { + "epoch": 0.27136, + "grad_norm": 0.87109375, + "learning_rate": 7.345322580645162e-05, + "loss": 0.1917, + "step": 16960 + }, + { + "epoch": 0.271376, + "grad_norm": 0.86328125, + "learning_rate": 7.345161290322581e-05, + "loss": 0.173, + "step": 16961 + }, + { + "epoch": 0.271392, + "grad_norm": 0.64453125, + "learning_rate": 7.345000000000001e-05, + "loss": 0.1592, + "step": 16962 + }, + { + "epoch": 0.271408, + "grad_norm": 0.66015625, + "learning_rate": 7.34483870967742e-05, + "loss": 0.1773, + "step": 16963 + }, + { + "epoch": 0.271424, + "grad_norm": 0.8125, + "learning_rate": 7.34467741935484e-05, + "loss": 0.1657, + "step": 16964 + }, + { + "epoch": 0.27144, + "grad_norm": 0.72265625, + "learning_rate": 7.344516129032258e-05, + "loss": 0.1771, + "step": 16965 + }, + { + "epoch": 0.271456, + "grad_norm": 0.91796875, + "learning_rate": 7.344354838709677e-05, + "loss": 0.1953, + "step": 16966 + }, + { + "epoch": 0.271472, + "grad_norm": 0.6015625, + "learning_rate": 7.344193548387097e-05, + "loss": 0.1807, + "step": 16967 + }, + { + "epoch": 0.271488, + "grad_norm": 0.89453125, + "learning_rate": 7.344032258064515e-05, + "loss": 0.2006, + "step": 16968 + }, + { + "epoch": 0.271504, + "grad_norm": 0.72265625, + "learning_rate": 7.343870967741935e-05, + "loss": 0.1273, + "step": 16969 + }, + { + "epoch": 0.27152, + "grad_norm": 0.7421875, + "learning_rate": 7.343709677419355e-05, + "loss": 0.2197, + "step": 16970 + }, + { + "epoch": 0.271536, + "grad_norm": 1.65625, + "learning_rate": 7.343548387096775e-05, + "loss": 0.1526, + "step": 16971 + }, + { + "epoch": 0.271552, + "grad_norm": 0.48046875, + "learning_rate": 7.343387096774194e-05, + "loss": 0.1491, + "step": 16972 + }, + { + "epoch": 0.271568, + "grad_norm": 0.7265625, + "learning_rate": 7.343225806451614e-05, + "loss": 0.1617, + "step": 16973 + }, + { + "epoch": 0.271584, + "grad_norm": 0.79296875, + "learning_rate": 7.343064516129032e-05, + "loss": 0.1258, + "step": 16974 + }, + { + "epoch": 0.2716, + "grad_norm": 1.109375, + "learning_rate": 7.342903225806452e-05, + "loss": 0.1765, + "step": 16975 + }, + { + "epoch": 0.271616, + "grad_norm": 0.6875, + "learning_rate": 7.342741935483871e-05, + "loss": 0.1638, + "step": 16976 + }, + { + "epoch": 0.271632, + "grad_norm": 1.4765625, + "learning_rate": 7.342580645161291e-05, + "loss": 0.1723, + "step": 16977 + }, + { + "epoch": 0.271648, + "grad_norm": 1.046875, + "learning_rate": 7.34241935483871e-05, + "loss": 0.1588, + "step": 16978 + }, + { + "epoch": 0.271664, + "grad_norm": 0.88671875, + "learning_rate": 7.34225806451613e-05, + "loss": 0.179, + "step": 16979 + }, + { + "epoch": 0.27168, + "grad_norm": 0.71875, + "learning_rate": 7.342096774193548e-05, + "loss": 0.1562, + "step": 16980 + }, + { + "epoch": 0.271696, + "grad_norm": 0.73828125, + "learning_rate": 7.341935483870968e-05, + "loss": 0.1643, + "step": 16981 + }, + { + "epoch": 0.271712, + "grad_norm": 0.75, + "learning_rate": 7.341774193548388e-05, + "loss": 0.1697, + "step": 16982 + }, + { + "epoch": 0.271728, + "grad_norm": 0.734375, + "learning_rate": 7.341612903225807e-05, + "loss": 0.1852, + "step": 16983 + }, + { + "epoch": 0.271744, + "grad_norm": 0.65234375, + "learning_rate": 7.341451612903227e-05, + "loss": 0.1328, + "step": 16984 + }, + { + "epoch": 0.27176, + "grad_norm": 0.62890625, + "learning_rate": 7.341290322580645e-05, + "loss": 0.1395, + "step": 16985 + }, + { + "epoch": 0.271776, + "grad_norm": 0.69140625, + "learning_rate": 7.341129032258065e-05, + "loss": 0.1588, + "step": 16986 + }, + { + "epoch": 0.271792, + "grad_norm": 0.796875, + "learning_rate": 7.340967741935484e-05, + "loss": 0.1809, + "step": 16987 + }, + { + "epoch": 0.271808, + "grad_norm": 1.0625, + "learning_rate": 7.340806451612904e-05, + "loss": 0.1614, + "step": 16988 + }, + { + "epoch": 0.271824, + "grad_norm": 0.76171875, + "learning_rate": 7.340645161290322e-05, + "loss": 0.1778, + "step": 16989 + }, + { + "epoch": 0.27184, + "grad_norm": 0.63671875, + "learning_rate": 7.340483870967742e-05, + "loss": 0.1933, + "step": 16990 + }, + { + "epoch": 0.271856, + "grad_norm": 1.03125, + "learning_rate": 7.340322580645161e-05, + "loss": 0.1965, + "step": 16991 + }, + { + "epoch": 0.271872, + "grad_norm": 0.84375, + "learning_rate": 7.340161290322581e-05, + "loss": 0.1634, + "step": 16992 + }, + { + "epoch": 0.271888, + "grad_norm": 0.4765625, + "learning_rate": 7.340000000000001e-05, + "loss": 0.1257, + "step": 16993 + }, + { + "epoch": 0.271904, + "grad_norm": 0.65625, + "learning_rate": 7.33983870967742e-05, + "loss": 0.1358, + "step": 16994 + }, + { + "epoch": 0.27192, + "grad_norm": 0.62890625, + "learning_rate": 7.33967741935484e-05, + "loss": 0.1747, + "step": 16995 + }, + { + "epoch": 0.271936, + "grad_norm": 0.6796875, + "learning_rate": 7.33951612903226e-05, + "loss": 0.1624, + "step": 16996 + }, + { + "epoch": 0.271952, + "grad_norm": 0.9296875, + "learning_rate": 7.339354838709678e-05, + "loss": 0.1743, + "step": 16997 + }, + { + "epoch": 0.271968, + "grad_norm": 0.9609375, + "learning_rate": 7.339193548387097e-05, + "loss": 0.1789, + "step": 16998 + }, + { + "epoch": 0.271984, + "grad_norm": 0.51953125, + "learning_rate": 7.339032258064517e-05, + "loss": 0.1309, + "step": 16999 + }, + { + "epoch": 0.272, + "grad_norm": 0.68359375, + "learning_rate": 7.338870967741935e-05, + "loss": 0.1512, + "step": 17000 + }, + { + "epoch": 0.272016, + "grad_norm": 0.77734375, + "learning_rate": 7.338709677419355e-05, + "loss": 0.1784, + "step": 17001 + }, + { + "epoch": 0.272032, + "grad_norm": 0.6640625, + "learning_rate": 7.338548387096774e-05, + "loss": 0.1918, + "step": 17002 + }, + { + "epoch": 0.272048, + "grad_norm": 0.734375, + "learning_rate": 7.338387096774194e-05, + "loss": 0.1812, + "step": 17003 + }, + { + "epoch": 0.272064, + "grad_norm": 0.94140625, + "learning_rate": 7.338225806451612e-05, + "loss": 0.1574, + "step": 17004 + }, + { + "epoch": 0.27208, + "grad_norm": 0.85546875, + "learning_rate": 7.338064516129032e-05, + "loss": 0.1557, + "step": 17005 + }, + { + "epoch": 0.272096, + "grad_norm": 1.09375, + "learning_rate": 7.337903225806452e-05, + "loss": 0.1686, + "step": 17006 + }, + { + "epoch": 0.272112, + "grad_norm": 0.70703125, + "learning_rate": 7.337741935483872e-05, + "loss": 0.1502, + "step": 17007 + }, + { + "epoch": 0.272128, + "grad_norm": 0.62109375, + "learning_rate": 7.337580645161291e-05, + "loss": 0.1571, + "step": 17008 + }, + { + "epoch": 0.272144, + "grad_norm": 0.640625, + "learning_rate": 7.337419354838711e-05, + "loss": 0.189, + "step": 17009 + }, + { + "epoch": 0.27216, + "grad_norm": 0.671875, + "learning_rate": 7.33725806451613e-05, + "loss": 0.137, + "step": 17010 + }, + { + "epoch": 0.272176, + "grad_norm": 0.65625, + "learning_rate": 7.33709677419355e-05, + "loss": 0.1912, + "step": 17011 + }, + { + "epoch": 0.272192, + "grad_norm": 0.57421875, + "learning_rate": 7.336935483870968e-05, + "loss": 0.1614, + "step": 17012 + }, + { + "epoch": 0.272208, + "grad_norm": 0.90625, + "learning_rate": 7.336774193548387e-05, + "loss": 0.145, + "step": 17013 + }, + { + "epoch": 0.272224, + "grad_norm": 0.94140625, + "learning_rate": 7.336612903225807e-05, + "loss": 0.1634, + "step": 17014 + }, + { + "epoch": 0.27224, + "grad_norm": 0.83984375, + "learning_rate": 7.336451612903225e-05, + "loss": 0.173, + "step": 17015 + }, + { + "epoch": 0.272256, + "grad_norm": 0.97265625, + "learning_rate": 7.336290322580645e-05, + "loss": 0.1775, + "step": 17016 + }, + { + "epoch": 0.272272, + "grad_norm": 0.62890625, + "learning_rate": 7.336129032258065e-05, + "loss": 0.1695, + "step": 17017 + }, + { + "epoch": 0.272288, + "grad_norm": 0.796875, + "learning_rate": 7.335967741935485e-05, + "loss": 0.1729, + "step": 17018 + }, + { + "epoch": 0.272304, + "grad_norm": 1.078125, + "learning_rate": 7.335806451612904e-05, + "loss": 0.1724, + "step": 17019 + }, + { + "epoch": 0.27232, + "grad_norm": 0.765625, + "learning_rate": 7.335645161290324e-05, + "loss": 0.1673, + "step": 17020 + }, + { + "epoch": 0.272336, + "grad_norm": 0.703125, + "learning_rate": 7.335483870967742e-05, + "loss": 0.1469, + "step": 17021 + }, + { + "epoch": 0.272352, + "grad_norm": 0.67578125, + "learning_rate": 7.335322580645162e-05, + "loss": 0.1543, + "step": 17022 + }, + { + "epoch": 0.272368, + "grad_norm": 0.63671875, + "learning_rate": 7.335161290322581e-05, + "loss": 0.1828, + "step": 17023 + }, + { + "epoch": 0.272384, + "grad_norm": 0.8203125, + "learning_rate": 7.335000000000001e-05, + "loss": 0.1579, + "step": 17024 + }, + { + "epoch": 0.2724, + "grad_norm": 0.9140625, + "learning_rate": 7.33483870967742e-05, + "loss": 0.1588, + "step": 17025 + }, + { + "epoch": 0.272416, + "grad_norm": 0.73046875, + "learning_rate": 7.33467741935484e-05, + "loss": 0.1589, + "step": 17026 + }, + { + "epoch": 0.272432, + "grad_norm": 1.03125, + "learning_rate": 7.334516129032258e-05, + "loss": 0.1749, + "step": 17027 + }, + { + "epoch": 0.272448, + "grad_norm": 1.046875, + "learning_rate": 7.334354838709678e-05, + "loss": 0.1633, + "step": 17028 + }, + { + "epoch": 0.272464, + "grad_norm": 0.79296875, + "learning_rate": 7.334193548387097e-05, + "loss": 0.1341, + "step": 17029 + }, + { + "epoch": 0.27248, + "grad_norm": 0.69140625, + "learning_rate": 7.334032258064516e-05, + "loss": 0.1769, + "step": 17030 + }, + { + "epoch": 0.272496, + "grad_norm": 0.71875, + "learning_rate": 7.333870967741936e-05, + "loss": 0.1416, + "step": 17031 + }, + { + "epoch": 0.272512, + "grad_norm": 0.91796875, + "learning_rate": 7.333709677419355e-05, + "loss": 0.1941, + "step": 17032 + }, + { + "epoch": 0.272528, + "grad_norm": 1.2109375, + "learning_rate": 7.333548387096775e-05, + "loss": 0.1583, + "step": 17033 + }, + { + "epoch": 0.272544, + "grad_norm": 0.76953125, + "learning_rate": 7.333387096774194e-05, + "loss": 0.178, + "step": 17034 + }, + { + "epoch": 0.27256, + "grad_norm": 0.88671875, + "learning_rate": 7.333225806451614e-05, + "loss": 0.1788, + "step": 17035 + }, + { + "epoch": 0.272576, + "grad_norm": 0.76953125, + "learning_rate": 7.333064516129032e-05, + "loss": 0.1853, + "step": 17036 + }, + { + "epoch": 0.272592, + "grad_norm": 0.76171875, + "learning_rate": 7.332903225806452e-05, + "loss": 0.1681, + "step": 17037 + }, + { + "epoch": 0.272608, + "grad_norm": 0.76171875, + "learning_rate": 7.332741935483871e-05, + "loss": 0.1855, + "step": 17038 + }, + { + "epoch": 0.272624, + "grad_norm": 0.70703125, + "learning_rate": 7.332580645161291e-05, + "loss": 0.2056, + "step": 17039 + }, + { + "epoch": 0.27264, + "grad_norm": 0.6328125, + "learning_rate": 7.332419354838709e-05, + "loss": 0.1559, + "step": 17040 + }, + { + "epoch": 0.272656, + "grad_norm": 0.6875, + "learning_rate": 7.332258064516129e-05, + "loss": 0.1669, + "step": 17041 + }, + { + "epoch": 0.272672, + "grad_norm": 0.6875, + "learning_rate": 7.332096774193549e-05, + "loss": 0.1689, + "step": 17042 + }, + { + "epoch": 0.272688, + "grad_norm": 0.625, + "learning_rate": 7.331935483870969e-05, + "loss": 0.1928, + "step": 17043 + }, + { + "epoch": 0.272704, + "grad_norm": 0.8046875, + "learning_rate": 7.331774193548388e-05, + "loss": 0.1795, + "step": 17044 + }, + { + "epoch": 0.27272, + "grad_norm": 0.5859375, + "learning_rate": 7.331612903225806e-05, + "loss": 0.1579, + "step": 17045 + }, + { + "epoch": 0.272736, + "grad_norm": 0.61328125, + "learning_rate": 7.331451612903226e-05, + "loss": 0.1647, + "step": 17046 + }, + { + "epoch": 0.272752, + "grad_norm": 0.59375, + "learning_rate": 7.331290322580645e-05, + "loss": 0.1928, + "step": 17047 + }, + { + "epoch": 0.272768, + "grad_norm": 0.85546875, + "learning_rate": 7.331129032258065e-05, + "loss": 0.2484, + "step": 17048 + }, + { + "epoch": 0.272784, + "grad_norm": 0.9609375, + "learning_rate": 7.330967741935484e-05, + "loss": 0.129, + "step": 17049 + }, + { + "epoch": 0.2728, + "grad_norm": 1.3046875, + "learning_rate": 7.330806451612904e-05, + "loss": 0.1653, + "step": 17050 + }, + { + "epoch": 0.272816, + "grad_norm": 0.734375, + "learning_rate": 7.330645161290322e-05, + "loss": 0.2099, + "step": 17051 + }, + { + "epoch": 0.272832, + "grad_norm": 1.03125, + "learning_rate": 7.330483870967742e-05, + "loss": 0.1903, + "step": 17052 + }, + { + "epoch": 0.272848, + "grad_norm": 0.61328125, + "learning_rate": 7.330322580645162e-05, + "loss": 0.1498, + "step": 17053 + }, + { + "epoch": 0.272864, + "grad_norm": 0.8671875, + "learning_rate": 7.330161290322582e-05, + "loss": 0.1252, + "step": 17054 + }, + { + "epoch": 0.27288, + "grad_norm": 0.640625, + "learning_rate": 7.33e-05, + "loss": 0.1669, + "step": 17055 + }, + { + "epoch": 0.272896, + "grad_norm": 0.98046875, + "learning_rate": 7.32983870967742e-05, + "loss": 0.1788, + "step": 17056 + }, + { + "epoch": 0.272912, + "grad_norm": 0.578125, + "learning_rate": 7.329677419354839e-05, + "loss": 0.158, + "step": 17057 + }, + { + "epoch": 0.272928, + "grad_norm": 0.90625, + "learning_rate": 7.329516129032259e-05, + "loss": 0.2003, + "step": 17058 + }, + { + "epoch": 0.272944, + "grad_norm": 0.796875, + "learning_rate": 7.329354838709678e-05, + "loss": 0.181, + "step": 17059 + }, + { + "epoch": 0.27296, + "grad_norm": 1.5546875, + "learning_rate": 7.329193548387096e-05, + "loss": 0.2012, + "step": 17060 + }, + { + "epoch": 0.272976, + "grad_norm": 1.6328125, + "learning_rate": 7.329032258064516e-05, + "loss": 0.1846, + "step": 17061 + }, + { + "epoch": 0.272992, + "grad_norm": 1.421875, + "learning_rate": 7.328870967741935e-05, + "loss": 0.1796, + "step": 17062 + }, + { + "epoch": 0.273008, + "grad_norm": 0.85546875, + "learning_rate": 7.328709677419355e-05, + "loss": 0.1642, + "step": 17063 + }, + { + "epoch": 0.273024, + "grad_norm": 1.140625, + "learning_rate": 7.328548387096774e-05, + "loss": 0.2015, + "step": 17064 + }, + { + "epoch": 0.27304, + "grad_norm": 1.1640625, + "learning_rate": 7.328387096774193e-05, + "loss": 0.1401, + "step": 17065 + }, + { + "epoch": 0.273056, + "grad_norm": 0.58984375, + "learning_rate": 7.328225806451613e-05, + "loss": 0.1852, + "step": 17066 + }, + { + "epoch": 0.273072, + "grad_norm": 0.63671875, + "learning_rate": 7.328064516129033e-05, + "loss": 0.1657, + "step": 17067 + }, + { + "epoch": 0.273088, + "grad_norm": 0.66015625, + "learning_rate": 7.327903225806452e-05, + "loss": 0.1525, + "step": 17068 + }, + { + "epoch": 0.273104, + "grad_norm": 0.6796875, + "learning_rate": 7.327741935483872e-05, + "loss": 0.1809, + "step": 17069 + }, + { + "epoch": 0.27312, + "grad_norm": 1.2890625, + "learning_rate": 7.32758064516129e-05, + "loss": 0.1678, + "step": 17070 + }, + { + "epoch": 0.273136, + "grad_norm": 0.64453125, + "learning_rate": 7.32741935483871e-05, + "loss": 0.1465, + "step": 17071 + }, + { + "epoch": 0.273152, + "grad_norm": 0.87890625, + "learning_rate": 7.327258064516129e-05, + "loss": 0.1508, + "step": 17072 + }, + { + "epoch": 0.273168, + "grad_norm": 0.60546875, + "learning_rate": 7.327096774193549e-05, + "loss": 0.1741, + "step": 17073 + }, + { + "epoch": 0.273184, + "grad_norm": 0.80078125, + "learning_rate": 7.326935483870968e-05, + "loss": 0.1832, + "step": 17074 + }, + { + "epoch": 0.2732, + "grad_norm": 0.6015625, + "learning_rate": 7.326774193548386e-05, + "loss": 0.1611, + "step": 17075 + }, + { + "epoch": 0.273216, + "grad_norm": 0.85546875, + "learning_rate": 7.326612903225806e-05, + "loss": 0.1879, + "step": 17076 + }, + { + "epoch": 0.273232, + "grad_norm": 1.2734375, + "learning_rate": 7.326451612903226e-05, + "loss": 0.1634, + "step": 17077 + }, + { + "epoch": 0.273248, + "grad_norm": 0.98046875, + "learning_rate": 7.326290322580646e-05, + "loss": 0.1883, + "step": 17078 + }, + { + "epoch": 0.273264, + "grad_norm": 1.0, + "learning_rate": 7.326129032258065e-05, + "loss": 0.1631, + "step": 17079 + }, + { + "epoch": 0.27328, + "grad_norm": 0.5859375, + "learning_rate": 7.325967741935485e-05, + "loss": 0.1647, + "step": 17080 + }, + { + "epoch": 0.273296, + "grad_norm": 0.828125, + "learning_rate": 7.325806451612903e-05, + "loss": 0.1853, + "step": 17081 + }, + { + "epoch": 0.273312, + "grad_norm": 0.88671875, + "learning_rate": 7.325645161290323e-05, + "loss": 0.17, + "step": 17082 + }, + { + "epoch": 0.273328, + "grad_norm": 0.8671875, + "learning_rate": 7.325483870967742e-05, + "loss": 0.1576, + "step": 17083 + }, + { + "epoch": 0.273344, + "grad_norm": 0.55078125, + "learning_rate": 7.325322580645162e-05, + "loss": 0.1747, + "step": 17084 + }, + { + "epoch": 0.27336, + "grad_norm": 0.68359375, + "learning_rate": 7.32516129032258e-05, + "loss": 0.1674, + "step": 17085 + }, + { + "epoch": 0.273376, + "grad_norm": 0.66796875, + "learning_rate": 7.325e-05, + "loss": 0.1877, + "step": 17086 + }, + { + "epoch": 0.273392, + "grad_norm": 0.91015625, + "learning_rate": 7.324838709677419e-05, + "loss": 0.1509, + "step": 17087 + }, + { + "epoch": 0.273408, + "grad_norm": 0.99609375, + "learning_rate": 7.324677419354839e-05, + "loss": 0.1651, + "step": 17088 + }, + { + "epoch": 0.273424, + "grad_norm": 0.6796875, + "learning_rate": 7.324516129032259e-05, + "loss": 0.1566, + "step": 17089 + }, + { + "epoch": 0.27344, + "grad_norm": 0.98828125, + "learning_rate": 7.324354838709678e-05, + "loss": 0.1374, + "step": 17090 + }, + { + "epoch": 0.273456, + "grad_norm": 0.6015625, + "learning_rate": 7.324193548387098e-05, + "loss": 0.1574, + "step": 17091 + }, + { + "epoch": 0.273472, + "grad_norm": 1.296875, + "learning_rate": 7.324032258064516e-05, + "loss": 0.2059, + "step": 17092 + }, + { + "epoch": 0.273488, + "grad_norm": 0.8671875, + "learning_rate": 7.323870967741936e-05, + "loss": 0.1796, + "step": 17093 + }, + { + "epoch": 0.273504, + "grad_norm": 0.5703125, + "learning_rate": 7.323709677419355e-05, + "loss": 0.1674, + "step": 17094 + }, + { + "epoch": 0.27352, + "grad_norm": 0.72265625, + "learning_rate": 7.323548387096775e-05, + "loss": 0.1692, + "step": 17095 + }, + { + "epoch": 0.273536, + "grad_norm": 0.53125, + "learning_rate": 7.323387096774193e-05, + "loss": 0.166, + "step": 17096 + }, + { + "epoch": 0.273552, + "grad_norm": 1.2578125, + "learning_rate": 7.323225806451613e-05, + "loss": 0.1718, + "step": 17097 + }, + { + "epoch": 0.273568, + "grad_norm": 1.3046875, + "learning_rate": 7.323064516129032e-05, + "loss": 0.1876, + "step": 17098 + }, + { + "epoch": 0.273584, + "grad_norm": 0.6796875, + "learning_rate": 7.322903225806452e-05, + "loss": 0.1593, + "step": 17099 + }, + { + "epoch": 0.2736, + "grad_norm": 0.78125, + "learning_rate": 7.32274193548387e-05, + "loss": 0.1368, + "step": 17100 + }, + { + "epoch": 0.273616, + "grad_norm": 0.75, + "learning_rate": 7.32258064516129e-05, + "loss": 0.1649, + "step": 17101 + }, + { + "epoch": 0.273632, + "grad_norm": 0.8984375, + "learning_rate": 7.32241935483871e-05, + "loss": 0.1715, + "step": 17102 + }, + { + "epoch": 0.273648, + "grad_norm": 0.88671875, + "learning_rate": 7.32225806451613e-05, + "loss": 0.1668, + "step": 17103 + }, + { + "epoch": 0.273664, + "grad_norm": 0.7421875, + "learning_rate": 7.322096774193549e-05, + "loss": 0.1523, + "step": 17104 + }, + { + "epoch": 0.27368, + "grad_norm": 0.83203125, + "learning_rate": 7.321935483870969e-05, + "loss": 0.1577, + "step": 17105 + }, + { + "epoch": 0.273696, + "grad_norm": 0.76953125, + "learning_rate": 7.321774193548388e-05, + "loss": 0.158, + "step": 17106 + }, + { + "epoch": 0.273712, + "grad_norm": 0.73046875, + "learning_rate": 7.321612903225806e-05, + "loss": 0.1545, + "step": 17107 + }, + { + "epoch": 0.273728, + "grad_norm": 0.7578125, + "learning_rate": 7.321451612903226e-05, + "loss": 0.1705, + "step": 17108 + }, + { + "epoch": 0.273744, + "grad_norm": 0.75, + "learning_rate": 7.321290322580645e-05, + "loss": 0.1864, + "step": 17109 + }, + { + "epoch": 0.27376, + "grad_norm": 0.54296875, + "learning_rate": 7.321129032258065e-05, + "loss": 0.1454, + "step": 17110 + }, + { + "epoch": 0.273776, + "grad_norm": 0.5234375, + "learning_rate": 7.320967741935483e-05, + "loss": 0.142, + "step": 17111 + }, + { + "epoch": 0.273792, + "grad_norm": 0.8984375, + "learning_rate": 7.320806451612903e-05, + "loss": 0.1859, + "step": 17112 + }, + { + "epoch": 0.273808, + "grad_norm": 0.59375, + "learning_rate": 7.320645161290323e-05, + "loss": 0.14, + "step": 17113 + }, + { + "epoch": 0.273824, + "grad_norm": 0.984375, + "learning_rate": 7.320483870967743e-05, + "loss": 0.1867, + "step": 17114 + }, + { + "epoch": 0.27384, + "grad_norm": 1.34375, + "learning_rate": 7.320322580645162e-05, + "loss": 0.157, + "step": 17115 + }, + { + "epoch": 0.273856, + "grad_norm": 1.0625, + "learning_rate": 7.320161290322582e-05, + "loss": 0.2096, + "step": 17116 + }, + { + "epoch": 0.273872, + "grad_norm": 0.59375, + "learning_rate": 7.32e-05, + "loss": 0.1555, + "step": 17117 + }, + { + "epoch": 0.273888, + "grad_norm": 0.81640625, + "learning_rate": 7.31983870967742e-05, + "loss": 0.1727, + "step": 17118 + }, + { + "epoch": 0.273904, + "grad_norm": 0.6015625, + "learning_rate": 7.319677419354839e-05, + "loss": 0.1371, + "step": 17119 + }, + { + "epoch": 0.27392, + "grad_norm": 0.828125, + "learning_rate": 7.319516129032259e-05, + "loss": 0.1543, + "step": 17120 + }, + { + "epoch": 0.273936, + "grad_norm": 0.75, + "learning_rate": 7.319354838709678e-05, + "loss": 0.1863, + "step": 17121 + }, + { + "epoch": 0.273952, + "grad_norm": 0.9375, + "learning_rate": 7.319193548387096e-05, + "loss": 0.1695, + "step": 17122 + }, + { + "epoch": 0.273968, + "grad_norm": 0.72265625, + "learning_rate": 7.319032258064516e-05, + "loss": 0.1708, + "step": 17123 + }, + { + "epoch": 0.273984, + "grad_norm": 0.734375, + "learning_rate": 7.318870967741935e-05, + "loss": 0.1948, + "step": 17124 + }, + { + "epoch": 0.274, + "grad_norm": 0.734375, + "learning_rate": 7.318709677419355e-05, + "loss": 0.1582, + "step": 17125 + }, + { + "epoch": 0.274016, + "grad_norm": 0.8125, + "learning_rate": 7.318548387096775e-05, + "loss": 0.144, + "step": 17126 + }, + { + "epoch": 0.274032, + "grad_norm": 0.875, + "learning_rate": 7.318387096774195e-05, + "loss": 0.1687, + "step": 17127 + }, + { + "epoch": 0.274048, + "grad_norm": 0.5703125, + "learning_rate": 7.318225806451613e-05, + "loss": 0.1417, + "step": 17128 + }, + { + "epoch": 0.274064, + "grad_norm": 0.6875, + "learning_rate": 7.318064516129033e-05, + "loss": 0.1557, + "step": 17129 + }, + { + "epoch": 0.27408, + "grad_norm": 0.7109375, + "learning_rate": 7.317903225806452e-05, + "loss": 0.1871, + "step": 17130 + }, + { + "epoch": 0.274096, + "grad_norm": 0.72265625, + "learning_rate": 7.317741935483872e-05, + "loss": 0.1619, + "step": 17131 + }, + { + "epoch": 0.274112, + "grad_norm": 0.7109375, + "learning_rate": 7.31758064516129e-05, + "loss": 0.1865, + "step": 17132 + }, + { + "epoch": 0.274128, + "grad_norm": 0.77734375, + "learning_rate": 7.31741935483871e-05, + "loss": 0.1148, + "step": 17133 + }, + { + "epoch": 0.274144, + "grad_norm": 0.7265625, + "learning_rate": 7.317258064516129e-05, + "loss": 0.1614, + "step": 17134 + }, + { + "epoch": 0.27416, + "grad_norm": 0.55859375, + "learning_rate": 7.317096774193549e-05, + "loss": 0.1398, + "step": 17135 + }, + { + "epoch": 0.274176, + "grad_norm": 0.79296875, + "learning_rate": 7.316935483870967e-05, + "loss": 0.1975, + "step": 17136 + }, + { + "epoch": 0.274192, + "grad_norm": 1.109375, + "learning_rate": 7.316774193548387e-05, + "loss": 0.1958, + "step": 17137 + }, + { + "epoch": 0.274208, + "grad_norm": 0.75, + "learning_rate": 7.316612903225807e-05, + "loss": 0.1891, + "step": 17138 + }, + { + "epoch": 0.274224, + "grad_norm": 0.75, + "learning_rate": 7.316451612903226e-05, + "loss": 0.1441, + "step": 17139 + }, + { + "epoch": 0.27424, + "grad_norm": 1.296875, + "learning_rate": 7.316290322580646e-05, + "loss": 0.1965, + "step": 17140 + }, + { + "epoch": 0.274256, + "grad_norm": 0.74609375, + "learning_rate": 7.316129032258065e-05, + "loss": 0.1983, + "step": 17141 + }, + { + "epoch": 0.274272, + "grad_norm": 0.96875, + "learning_rate": 7.315967741935485e-05, + "loss": 0.198, + "step": 17142 + }, + { + "epoch": 0.274288, + "grad_norm": 0.72265625, + "learning_rate": 7.315806451612903e-05, + "loss": 0.1502, + "step": 17143 + }, + { + "epoch": 0.274304, + "grad_norm": 0.98046875, + "learning_rate": 7.315645161290323e-05, + "loss": 0.1642, + "step": 17144 + }, + { + "epoch": 0.27432, + "grad_norm": 0.78515625, + "learning_rate": 7.315483870967742e-05, + "loss": 0.1714, + "step": 17145 + }, + { + "epoch": 0.274336, + "grad_norm": 0.80859375, + "learning_rate": 7.315322580645162e-05, + "loss": 0.1831, + "step": 17146 + }, + { + "epoch": 0.274352, + "grad_norm": 0.71484375, + "learning_rate": 7.31516129032258e-05, + "loss": 0.1606, + "step": 17147 + }, + { + "epoch": 0.274368, + "grad_norm": 1.3828125, + "learning_rate": 7.315e-05, + "loss": 0.2123, + "step": 17148 + }, + { + "epoch": 0.274384, + "grad_norm": 0.5703125, + "learning_rate": 7.31483870967742e-05, + "loss": 0.1789, + "step": 17149 + }, + { + "epoch": 0.2744, + "grad_norm": 1.0078125, + "learning_rate": 7.31467741935484e-05, + "loss": 0.2095, + "step": 17150 + }, + { + "epoch": 0.274416, + "grad_norm": 0.80078125, + "learning_rate": 7.314516129032259e-05, + "loss": 0.1614, + "step": 17151 + }, + { + "epoch": 0.274432, + "grad_norm": 0.921875, + "learning_rate": 7.314354838709679e-05, + "loss": 0.1511, + "step": 17152 + }, + { + "epoch": 0.274448, + "grad_norm": 1.078125, + "learning_rate": 7.314193548387097e-05, + "loss": 0.1944, + "step": 17153 + }, + { + "epoch": 0.274464, + "grad_norm": 0.65234375, + "learning_rate": 7.314032258064516e-05, + "loss": 0.1385, + "step": 17154 + }, + { + "epoch": 0.27448, + "grad_norm": 0.91796875, + "learning_rate": 7.313870967741936e-05, + "loss": 0.1659, + "step": 17155 + }, + { + "epoch": 0.274496, + "grad_norm": 0.71484375, + "learning_rate": 7.313709677419355e-05, + "loss": 0.178, + "step": 17156 + }, + { + "epoch": 0.274512, + "grad_norm": 1.015625, + "learning_rate": 7.313548387096775e-05, + "loss": 0.1761, + "step": 17157 + }, + { + "epoch": 0.274528, + "grad_norm": 0.62890625, + "learning_rate": 7.313387096774193e-05, + "loss": 0.176, + "step": 17158 + }, + { + "epoch": 0.274544, + "grad_norm": 0.68359375, + "learning_rate": 7.313225806451613e-05, + "loss": 0.1691, + "step": 17159 + }, + { + "epoch": 0.27456, + "grad_norm": 1.4296875, + "learning_rate": 7.313064516129032e-05, + "loss": 0.1655, + "step": 17160 + }, + { + "epoch": 0.274576, + "grad_norm": 0.71484375, + "learning_rate": 7.312903225806452e-05, + "loss": 0.1588, + "step": 17161 + }, + { + "epoch": 0.274592, + "grad_norm": 0.59375, + "learning_rate": 7.312741935483872e-05, + "loss": 0.1675, + "step": 17162 + }, + { + "epoch": 0.274608, + "grad_norm": 0.98046875, + "learning_rate": 7.312580645161292e-05, + "loss": 0.1818, + "step": 17163 + }, + { + "epoch": 0.274624, + "grad_norm": 0.93359375, + "learning_rate": 7.31241935483871e-05, + "loss": 0.176, + "step": 17164 + }, + { + "epoch": 0.27464, + "grad_norm": 0.69921875, + "learning_rate": 7.31225806451613e-05, + "loss": 0.189, + "step": 17165 + }, + { + "epoch": 0.274656, + "grad_norm": 0.97265625, + "learning_rate": 7.312096774193549e-05, + "loss": 0.1548, + "step": 17166 + }, + { + "epoch": 0.274672, + "grad_norm": 0.80859375, + "learning_rate": 7.311935483870969e-05, + "loss": 0.1487, + "step": 17167 + }, + { + "epoch": 0.274688, + "grad_norm": 1.4453125, + "learning_rate": 7.311774193548387e-05, + "loss": 0.2193, + "step": 17168 + }, + { + "epoch": 0.274704, + "grad_norm": 1.0546875, + "learning_rate": 7.311612903225806e-05, + "loss": 0.1747, + "step": 17169 + }, + { + "epoch": 0.27472, + "grad_norm": 1.5078125, + "learning_rate": 7.311451612903226e-05, + "loss": 0.2146, + "step": 17170 + }, + { + "epoch": 0.274736, + "grad_norm": 0.50390625, + "learning_rate": 7.311290322580645e-05, + "loss": 0.1749, + "step": 17171 + }, + { + "epoch": 0.274752, + "grad_norm": 0.9765625, + "learning_rate": 7.311129032258064e-05, + "loss": 0.1822, + "step": 17172 + }, + { + "epoch": 0.274768, + "grad_norm": 0.51953125, + "learning_rate": 7.310967741935484e-05, + "loss": 0.1396, + "step": 17173 + }, + { + "epoch": 0.274784, + "grad_norm": 1.046875, + "learning_rate": 7.310806451612904e-05, + "loss": 0.1621, + "step": 17174 + }, + { + "epoch": 0.2748, + "grad_norm": 0.78515625, + "learning_rate": 7.310645161290323e-05, + "loss": 0.1642, + "step": 17175 + }, + { + "epoch": 0.274816, + "grad_norm": 0.91796875, + "learning_rate": 7.310483870967743e-05, + "loss": 0.1794, + "step": 17176 + }, + { + "epoch": 0.274832, + "grad_norm": 0.609375, + "learning_rate": 7.310322580645162e-05, + "loss": 0.1626, + "step": 17177 + }, + { + "epoch": 0.274848, + "grad_norm": 0.89453125, + "learning_rate": 7.310161290322582e-05, + "loss": 0.1698, + "step": 17178 + }, + { + "epoch": 0.274864, + "grad_norm": 0.56640625, + "learning_rate": 7.31e-05, + "loss": 0.1739, + "step": 17179 + }, + { + "epoch": 0.27488, + "grad_norm": 0.796875, + "learning_rate": 7.30983870967742e-05, + "loss": 0.1737, + "step": 17180 + }, + { + "epoch": 0.274896, + "grad_norm": 0.6484375, + "learning_rate": 7.309677419354839e-05, + "loss": 0.1781, + "step": 17181 + }, + { + "epoch": 0.274912, + "grad_norm": 0.89453125, + "learning_rate": 7.309516129032259e-05, + "loss": 0.2049, + "step": 17182 + }, + { + "epoch": 0.274928, + "grad_norm": 0.61328125, + "learning_rate": 7.309354838709677e-05, + "loss": 0.1574, + "step": 17183 + }, + { + "epoch": 0.274944, + "grad_norm": 0.609375, + "learning_rate": 7.309193548387097e-05, + "loss": 0.168, + "step": 17184 + }, + { + "epoch": 0.27496, + "grad_norm": 0.5703125, + "learning_rate": 7.309032258064516e-05, + "loss": 0.198, + "step": 17185 + }, + { + "epoch": 0.274976, + "grad_norm": 0.5859375, + "learning_rate": 7.308870967741936e-05, + "loss": 0.1486, + "step": 17186 + }, + { + "epoch": 0.274992, + "grad_norm": 0.78515625, + "learning_rate": 7.308709677419356e-05, + "loss": 0.2221, + "step": 17187 + }, + { + "epoch": 0.275008, + "grad_norm": 0.8515625, + "learning_rate": 7.308548387096774e-05, + "loss": 0.1846, + "step": 17188 + }, + { + "epoch": 0.275024, + "grad_norm": 0.796875, + "learning_rate": 7.308387096774194e-05, + "loss": 0.1855, + "step": 17189 + }, + { + "epoch": 0.27504, + "grad_norm": 0.6796875, + "learning_rate": 7.308225806451613e-05, + "loss": 0.1487, + "step": 17190 + }, + { + "epoch": 0.275056, + "grad_norm": 0.62890625, + "learning_rate": 7.308064516129033e-05, + "loss": 0.1925, + "step": 17191 + }, + { + "epoch": 0.275072, + "grad_norm": 0.8515625, + "learning_rate": 7.307903225806452e-05, + "loss": 0.1771, + "step": 17192 + }, + { + "epoch": 0.275088, + "grad_norm": 0.85546875, + "learning_rate": 7.307741935483871e-05, + "loss": 0.2015, + "step": 17193 + }, + { + "epoch": 0.275104, + "grad_norm": 0.6328125, + "learning_rate": 7.30758064516129e-05, + "loss": 0.1442, + "step": 17194 + }, + { + "epoch": 0.27512, + "grad_norm": 0.67578125, + "learning_rate": 7.30741935483871e-05, + "loss": 0.1761, + "step": 17195 + }, + { + "epoch": 0.275136, + "grad_norm": 0.953125, + "learning_rate": 7.307258064516129e-05, + "loss": 0.1195, + "step": 17196 + }, + { + "epoch": 0.275152, + "grad_norm": 0.59375, + "learning_rate": 7.307096774193549e-05, + "loss": 0.1204, + "step": 17197 + }, + { + "epoch": 0.275168, + "grad_norm": 0.88671875, + "learning_rate": 7.306935483870969e-05, + "loss": 0.1437, + "step": 17198 + }, + { + "epoch": 0.275184, + "grad_norm": 0.98828125, + "learning_rate": 7.306774193548389e-05, + "loss": 0.1518, + "step": 17199 + }, + { + "epoch": 0.2752, + "grad_norm": 1.21875, + "learning_rate": 7.306612903225807e-05, + "loss": 0.1639, + "step": 17200 + }, + { + "epoch": 0.275216, + "grad_norm": 0.9609375, + "learning_rate": 7.306451612903226e-05, + "loss": 0.1758, + "step": 17201 + }, + { + "epoch": 0.275232, + "grad_norm": 0.87109375, + "learning_rate": 7.306290322580646e-05, + "loss": 0.1605, + "step": 17202 + }, + { + "epoch": 0.275248, + "grad_norm": 0.96875, + "learning_rate": 7.306129032258064e-05, + "loss": 0.1814, + "step": 17203 + }, + { + "epoch": 0.275264, + "grad_norm": 0.79296875, + "learning_rate": 7.305967741935484e-05, + "loss": 0.2131, + "step": 17204 + }, + { + "epoch": 0.27528, + "grad_norm": 0.765625, + "learning_rate": 7.305806451612903e-05, + "loss": 0.1596, + "step": 17205 + }, + { + "epoch": 0.275296, + "grad_norm": 1.1328125, + "learning_rate": 7.305645161290323e-05, + "loss": 0.1615, + "step": 17206 + }, + { + "epoch": 0.275312, + "grad_norm": 0.89453125, + "learning_rate": 7.305483870967741e-05, + "loss": 0.1727, + "step": 17207 + }, + { + "epoch": 0.275328, + "grad_norm": 0.70703125, + "learning_rate": 7.305322580645161e-05, + "loss": 0.1579, + "step": 17208 + }, + { + "epoch": 0.275344, + "grad_norm": 0.80859375, + "learning_rate": 7.305161290322581e-05, + "loss": 0.1942, + "step": 17209 + }, + { + "epoch": 0.27536, + "grad_norm": 0.8828125, + "learning_rate": 7.305000000000001e-05, + "loss": 0.1579, + "step": 17210 + }, + { + "epoch": 0.275376, + "grad_norm": 0.75, + "learning_rate": 7.30483870967742e-05, + "loss": 0.1347, + "step": 17211 + }, + { + "epoch": 0.275392, + "grad_norm": 0.7421875, + "learning_rate": 7.30467741935484e-05, + "loss": 0.1774, + "step": 17212 + }, + { + "epoch": 0.275408, + "grad_norm": 0.482421875, + "learning_rate": 7.304516129032259e-05, + "loss": 0.1562, + "step": 17213 + }, + { + "epoch": 0.275424, + "grad_norm": 1.078125, + "learning_rate": 7.304354838709679e-05, + "loss": 0.1817, + "step": 17214 + }, + { + "epoch": 0.27544, + "grad_norm": 1.203125, + "learning_rate": 7.304193548387097e-05, + "loss": 0.2052, + "step": 17215 + }, + { + "epoch": 0.275456, + "grad_norm": 1.203125, + "learning_rate": 7.304032258064516e-05, + "loss": 0.1795, + "step": 17216 + }, + { + "epoch": 0.275472, + "grad_norm": 1.328125, + "learning_rate": 7.303870967741936e-05, + "loss": 0.1453, + "step": 17217 + }, + { + "epoch": 0.275488, + "grad_norm": 0.6484375, + "learning_rate": 7.303709677419354e-05, + "loss": 0.1549, + "step": 17218 + }, + { + "epoch": 0.275504, + "grad_norm": 0.84375, + "learning_rate": 7.303548387096774e-05, + "loss": 0.1938, + "step": 17219 + }, + { + "epoch": 0.27552, + "grad_norm": 0.8046875, + "learning_rate": 7.303387096774193e-05, + "loss": 0.154, + "step": 17220 + }, + { + "epoch": 0.275536, + "grad_norm": 0.6328125, + "learning_rate": 7.303225806451613e-05, + "loss": 0.1895, + "step": 17221 + }, + { + "epoch": 0.275552, + "grad_norm": 1.5, + "learning_rate": 7.303064516129033e-05, + "loss": 0.204, + "step": 17222 + }, + { + "epoch": 0.275568, + "grad_norm": 0.5234375, + "learning_rate": 7.302903225806453e-05, + "loss": 0.1468, + "step": 17223 + }, + { + "epoch": 0.275584, + "grad_norm": 0.796875, + "learning_rate": 7.302741935483871e-05, + "loss": 0.1821, + "step": 17224 + }, + { + "epoch": 0.2756, + "grad_norm": 0.796875, + "learning_rate": 7.302580645161291e-05, + "loss": 0.1774, + "step": 17225 + }, + { + "epoch": 0.275616, + "grad_norm": 0.921875, + "learning_rate": 7.30241935483871e-05, + "loss": 0.1948, + "step": 17226 + }, + { + "epoch": 0.275632, + "grad_norm": 0.83203125, + "learning_rate": 7.30225806451613e-05, + "loss": 0.187, + "step": 17227 + }, + { + "epoch": 0.275648, + "grad_norm": 1.1015625, + "learning_rate": 7.302096774193549e-05, + "loss": 0.2195, + "step": 17228 + }, + { + "epoch": 0.275664, + "grad_norm": 0.98046875, + "learning_rate": 7.301935483870968e-05, + "loss": 0.147, + "step": 17229 + }, + { + "epoch": 0.27568, + "grad_norm": 0.6953125, + "learning_rate": 7.301774193548387e-05, + "loss": 0.1516, + "step": 17230 + }, + { + "epoch": 0.275696, + "grad_norm": 0.859375, + "learning_rate": 7.301612903225806e-05, + "loss": 0.1643, + "step": 17231 + }, + { + "epoch": 0.275712, + "grad_norm": 0.98828125, + "learning_rate": 7.301451612903226e-05, + "loss": 0.1751, + "step": 17232 + }, + { + "epoch": 0.275728, + "grad_norm": 0.69140625, + "learning_rate": 7.301290322580646e-05, + "loss": 0.1774, + "step": 17233 + }, + { + "epoch": 0.275744, + "grad_norm": 1.21875, + "learning_rate": 7.301129032258066e-05, + "loss": 0.184, + "step": 17234 + }, + { + "epoch": 0.27576, + "grad_norm": 0.6953125, + "learning_rate": 7.300967741935484e-05, + "loss": 0.1591, + "step": 17235 + }, + { + "epoch": 0.275776, + "grad_norm": 0.72265625, + "learning_rate": 7.300806451612904e-05, + "loss": 0.1682, + "step": 17236 + }, + { + "epoch": 0.275792, + "grad_norm": 0.71484375, + "learning_rate": 7.300645161290323e-05, + "loss": 0.1298, + "step": 17237 + }, + { + "epoch": 0.275808, + "grad_norm": 0.95703125, + "learning_rate": 7.300483870967743e-05, + "loss": 0.2038, + "step": 17238 + }, + { + "epoch": 0.275824, + "grad_norm": 0.69921875, + "learning_rate": 7.300322580645161e-05, + "loss": 0.1776, + "step": 17239 + }, + { + "epoch": 0.27584, + "grad_norm": 0.5703125, + "learning_rate": 7.300161290322581e-05, + "loss": 0.1485, + "step": 17240 + }, + { + "epoch": 0.275856, + "grad_norm": 0.77734375, + "learning_rate": 7.3e-05, + "loss": 0.2076, + "step": 17241 + }, + { + "epoch": 0.275872, + "grad_norm": 0.796875, + "learning_rate": 7.29983870967742e-05, + "loss": 0.2094, + "step": 17242 + }, + { + "epoch": 0.275888, + "grad_norm": 0.9375, + "learning_rate": 7.299677419354838e-05, + "loss": 0.164, + "step": 17243 + }, + { + "epoch": 0.275904, + "grad_norm": 1.3125, + "learning_rate": 7.299516129032258e-05, + "loss": 0.2004, + "step": 17244 + }, + { + "epoch": 0.27592, + "grad_norm": 1.140625, + "learning_rate": 7.299354838709678e-05, + "loss": 0.1594, + "step": 17245 + }, + { + "epoch": 0.275936, + "grad_norm": 0.81640625, + "learning_rate": 7.299193548387097e-05, + "loss": 0.2049, + "step": 17246 + }, + { + "epoch": 0.275952, + "grad_norm": 0.72265625, + "learning_rate": 7.299032258064517e-05, + "loss": 0.1532, + "step": 17247 + }, + { + "epoch": 0.275968, + "grad_norm": 0.68359375, + "learning_rate": 7.298870967741936e-05, + "loss": 0.1668, + "step": 17248 + }, + { + "epoch": 0.275984, + "grad_norm": 0.91796875, + "learning_rate": 7.298709677419356e-05, + "loss": 0.2175, + "step": 17249 + }, + { + "epoch": 0.276, + "grad_norm": 0.6875, + "learning_rate": 7.298548387096774e-05, + "loss": 0.1642, + "step": 17250 + }, + { + "epoch": 0.276016, + "grad_norm": 1.2734375, + "learning_rate": 7.298387096774194e-05, + "loss": 0.2048, + "step": 17251 + }, + { + "epoch": 0.276032, + "grad_norm": 0.8125, + "learning_rate": 7.298225806451613e-05, + "loss": 0.1885, + "step": 17252 + }, + { + "epoch": 0.276048, + "grad_norm": 0.6328125, + "learning_rate": 7.298064516129033e-05, + "loss": 0.1618, + "step": 17253 + }, + { + "epoch": 0.276064, + "grad_norm": 0.8515625, + "learning_rate": 7.297903225806451e-05, + "loss": 0.1973, + "step": 17254 + }, + { + "epoch": 0.27608, + "grad_norm": 0.85546875, + "learning_rate": 7.297741935483871e-05, + "loss": 0.1663, + "step": 17255 + }, + { + "epoch": 0.276096, + "grad_norm": 1.03125, + "learning_rate": 7.29758064516129e-05, + "loss": 0.1973, + "step": 17256 + }, + { + "epoch": 0.276112, + "grad_norm": 0.6171875, + "learning_rate": 7.29741935483871e-05, + "loss": 0.1873, + "step": 17257 + }, + { + "epoch": 0.276128, + "grad_norm": 0.5859375, + "learning_rate": 7.29725806451613e-05, + "loss": 0.1778, + "step": 17258 + }, + { + "epoch": 0.276144, + "grad_norm": 1.0078125, + "learning_rate": 7.29709677419355e-05, + "loss": 0.1596, + "step": 17259 + }, + { + "epoch": 0.27616, + "grad_norm": 0.609375, + "learning_rate": 7.296935483870968e-05, + "loss": 0.1793, + "step": 17260 + }, + { + "epoch": 0.276176, + "grad_norm": 0.671875, + "learning_rate": 7.296774193548388e-05, + "loss": 0.1598, + "step": 17261 + }, + { + "epoch": 0.276192, + "grad_norm": 0.7578125, + "learning_rate": 7.296612903225807e-05, + "loss": 0.1813, + "step": 17262 + }, + { + "epoch": 0.276208, + "grad_norm": 0.578125, + "learning_rate": 7.296451612903226e-05, + "loss": 0.1619, + "step": 17263 + }, + { + "epoch": 0.276224, + "grad_norm": 0.5859375, + "learning_rate": 7.296290322580645e-05, + "loss": 0.136, + "step": 17264 + }, + { + "epoch": 0.27624, + "grad_norm": 0.52734375, + "learning_rate": 7.296129032258064e-05, + "loss": 0.1783, + "step": 17265 + }, + { + "epoch": 0.276256, + "grad_norm": 0.9296875, + "learning_rate": 7.295967741935484e-05, + "loss": 0.1541, + "step": 17266 + }, + { + "epoch": 0.276272, + "grad_norm": 2.109375, + "learning_rate": 7.295806451612903e-05, + "loss": 0.1756, + "step": 17267 + }, + { + "epoch": 0.276288, + "grad_norm": 0.66015625, + "learning_rate": 7.295645161290323e-05, + "loss": 0.1817, + "step": 17268 + }, + { + "epoch": 0.276304, + "grad_norm": 1.0078125, + "learning_rate": 7.295483870967743e-05, + "loss": 0.1614, + "step": 17269 + }, + { + "epoch": 0.27632, + "grad_norm": 0.74609375, + "learning_rate": 7.295322580645163e-05, + "loss": 0.1985, + "step": 17270 + }, + { + "epoch": 0.276336, + "grad_norm": 1.0859375, + "learning_rate": 7.295161290322581e-05, + "loss": 0.2037, + "step": 17271 + }, + { + "epoch": 0.276352, + "grad_norm": 1.046875, + "learning_rate": 7.295000000000001e-05, + "loss": 0.1392, + "step": 17272 + }, + { + "epoch": 0.276368, + "grad_norm": 0.6953125, + "learning_rate": 7.29483870967742e-05, + "loss": 0.1732, + "step": 17273 + }, + { + "epoch": 0.276384, + "grad_norm": 0.703125, + "learning_rate": 7.29467741935484e-05, + "loss": 0.1855, + "step": 17274 + }, + { + "epoch": 0.2764, + "grad_norm": 0.53125, + "learning_rate": 7.294516129032258e-05, + "loss": 0.1451, + "step": 17275 + }, + { + "epoch": 0.276416, + "grad_norm": 1.015625, + "learning_rate": 7.294354838709678e-05, + "loss": 0.1732, + "step": 17276 + }, + { + "epoch": 0.276432, + "grad_norm": 0.97265625, + "learning_rate": 7.294193548387097e-05, + "loss": 0.171, + "step": 17277 + }, + { + "epoch": 0.276448, + "grad_norm": 0.7109375, + "learning_rate": 7.294032258064515e-05, + "loss": 0.1465, + "step": 17278 + }, + { + "epoch": 0.276464, + "grad_norm": 0.7265625, + "learning_rate": 7.293870967741935e-05, + "loss": 0.1833, + "step": 17279 + }, + { + "epoch": 0.27648, + "grad_norm": 0.765625, + "learning_rate": 7.293709677419354e-05, + "loss": 0.1536, + "step": 17280 + }, + { + "epoch": 0.276496, + "grad_norm": 0.78515625, + "learning_rate": 7.293548387096774e-05, + "loss": 0.1449, + "step": 17281 + }, + { + "epoch": 0.276512, + "grad_norm": 0.57421875, + "learning_rate": 7.293387096774194e-05, + "loss": 0.1801, + "step": 17282 + }, + { + "epoch": 0.276528, + "grad_norm": 0.71875, + "learning_rate": 7.293225806451614e-05, + "loss": 0.141, + "step": 17283 + }, + { + "epoch": 0.276544, + "grad_norm": 0.89453125, + "learning_rate": 7.293064516129033e-05, + "loss": 0.1791, + "step": 17284 + }, + { + "epoch": 0.27656, + "grad_norm": 0.8359375, + "learning_rate": 7.292903225806453e-05, + "loss": 0.1546, + "step": 17285 + }, + { + "epoch": 0.276576, + "grad_norm": 0.953125, + "learning_rate": 7.292741935483871e-05, + "loss": 0.1877, + "step": 17286 + }, + { + "epoch": 0.276592, + "grad_norm": 1.15625, + "learning_rate": 7.292580645161291e-05, + "loss": 0.1439, + "step": 17287 + }, + { + "epoch": 0.276608, + "grad_norm": 0.63671875, + "learning_rate": 7.29241935483871e-05, + "loss": 0.1411, + "step": 17288 + }, + { + "epoch": 0.276624, + "grad_norm": 0.92578125, + "learning_rate": 7.29225806451613e-05, + "loss": 0.1901, + "step": 17289 + }, + { + "epoch": 0.27664, + "grad_norm": 0.66796875, + "learning_rate": 7.292096774193548e-05, + "loss": 0.1589, + "step": 17290 + }, + { + "epoch": 0.276656, + "grad_norm": 0.80859375, + "learning_rate": 7.291935483870968e-05, + "loss": 0.1899, + "step": 17291 + }, + { + "epoch": 0.276672, + "grad_norm": 0.9140625, + "learning_rate": 7.291774193548387e-05, + "loss": 0.1673, + "step": 17292 + }, + { + "epoch": 0.276688, + "grad_norm": 1.09375, + "learning_rate": 7.291612903225807e-05, + "loss": 0.1934, + "step": 17293 + }, + { + "epoch": 0.276704, + "grad_norm": 0.7109375, + "learning_rate": 7.291451612903227e-05, + "loss": 0.1408, + "step": 17294 + }, + { + "epoch": 0.27672, + "grad_norm": 0.83203125, + "learning_rate": 7.291290322580645e-05, + "loss": 0.1831, + "step": 17295 + }, + { + "epoch": 0.276736, + "grad_norm": 0.66015625, + "learning_rate": 7.291129032258065e-05, + "loss": 0.1517, + "step": 17296 + }, + { + "epoch": 0.276752, + "grad_norm": 1.1484375, + "learning_rate": 7.290967741935484e-05, + "loss": 0.1572, + "step": 17297 + }, + { + "epoch": 0.276768, + "grad_norm": 0.7890625, + "learning_rate": 7.290806451612904e-05, + "loss": 0.1692, + "step": 17298 + }, + { + "epoch": 0.276784, + "grad_norm": 0.57421875, + "learning_rate": 7.290645161290323e-05, + "loss": 0.1724, + "step": 17299 + }, + { + "epoch": 0.2768, + "grad_norm": 0.5703125, + "learning_rate": 7.290483870967742e-05, + "loss": 0.1537, + "step": 17300 + }, + { + "epoch": 0.276816, + "grad_norm": 0.73046875, + "learning_rate": 7.290322580645161e-05, + "loss": 0.1569, + "step": 17301 + }, + { + "epoch": 0.276832, + "grad_norm": 0.65625, + "learning_rate": 7.290161290322581e-05, + "loss": 0.162, + "step": 17302 + }, + { + "epoch": 0.276848, + "grad_norm": 0.62890625, + "learning_rate": 7.29e-05, + "loss": 0.1734, + "step": 17303 + }, + { + "epoch": 0.276864, + "grad_norm": 0.96875, + "learning_rate": 7.28983870967742e-05, + "loss": 0.1401, + "step": 17304 + }, + { + "epoch": 0.27688, + "grad_norm": 0.69140625, + "learning_rate": 7.28967741935484e-05, + "loss": 0.1927, + "step": 17305 + }, + { + "epoch": 0.276896, + "grad_norm": 0.8671875, + "learning_rate": 7.28951612903226e-05, + "loss": 0.145, + "step": 17306 + }, + { + "epoch": 0.276912, + "grad_norm": 0.7265625, + "learning_rate": 7.289354838709678e-05, + "loss": 0.1593, + "step": 17307 + }, + { + "epoch": 0.276928, + "grad_norm": 0.73828125, + "learning_rate": 7.289193548387098e-05, + "loss": 0.1444, + "step": 17308 + }, + { + "epoch": 0.276944, + "grad_norm": 0.640625, + "learning_rate": 7.289032258064517e-05, + "loss": 0.1641, + "step": 17309 + }, + { + "epoch": 0.27696, + "grad_norm": 0.671875, + "learning_rate": 7.288870967741935e-05, + "loss": 0.1593, + "step": 17310 + }, + { + "epoch": 0.276976, + "grad_norm": 0.7734375, + "learning_rate": 7.288709677419355e-05, + "loss": 0.1462, + "step": 17311 + }, + { + "epoch": 0.276992, + "grad_norm": 0.796875, + "learning_rate": 7.288548387096774e-05, + "loss": 0.1431, + "step": 17312 + }, + { + "epoch": 0.277008, + "grad_norm": 0.70703125, + "learning_rate": 7.288387096774194e-05, + "loss": 0.1976, + "step": 17313 + }, + { + "epoch": 0.277024, + "grad_norm": 0.609375, + "learning_rate": 7.288225806451612e-05, + "loss": 0.1851, + "step": 17314 + }, + { + "epoch": 0.27704, + "grad_norm": 0.96875, + "learning_rate": 7.288064516129032e-05, + "loss": 0.1555, + "step": 17315 + }, + { + "epoch": 0.277056, + "grad_norm": 0.8828125, + "learning_rate": 7.287903225806451e-05, + "loss": 0.1423, + "step": 17316 + }, + { + "epoch": 0.277072, + "grad_norm": 0.69921875, + "learning_rate": 7.287741935483871e-05, + "loss": 0.1728, + "step": 17317 + }, + { + "epoch": 0.277088, + "grad_norm": 0.765625, + "learning_rate": 7.287580645161291e-05, + "loss": 0.182, + "step": 17318 + }, + { + "epoch": 0.277104, + "grad_norm": 0.625, + "learning_rate": 7.287419354838711e-05, + "loss": 0.1739, + "step": 17319 + }, + { + "epoch": 0.27712, + "grad_norm": 0.7578125, + "learning_rate": 7.28725806451613e-05, + "loss": 0.1772, + "step": 17320 + }, + { + "epoch": 0.277136, + "grad_norm": 0.74609375, + "learning_rate": 7.28709677419355e-05, + "loss": 0.2037, + "step": 17321 + }, + { + "epoch": 0.277152, + "grad_norm": 0.7421875, + "learning_rate": 7.286935483870968e-05, + "loss": 0.1923, + "step": 17322 + }, + { + "epoch": 0.277168, + "grad_norm": 0.703125, + "learning_rate": 7.286774193548388e-05, + "loss": 0.1634, + "step": 17323 + }, + { + "epoch": 0.277184, + "grad_norm": 0.70703125, + "learning_rate": 7.286612903225807e-05, + "loss": 0.1613, + "step": 17324 + }, + { + "epoch": 0.2772, + "grad_norm": 0.78125, + "learning_rate": 7.286451612903225e-05, + "loss": 0.1587, + "step": 17325 + }, + { + "epoch": 0.277216, + "grad_norm": 1.0078125, + "learning_rate": 7.286290322580645e-05, + "loss": 0.1539, + "step": 17326 + }, + { + "epoch": 0.277232, + "grad_norm": 0.83984375, + "learning_rate": 7.286129032258064e-05, + "loss": 0.1775, + "step": 17327 + }, + { + "epoch": 0.277248, + "grad_norm": 0.69921875, + "learning_rate": 7.285967741935484e-05, + "loss": 0.1636, + "step": 17328 + }, + { + "epoch": 0.277264, + "grad_norm": 0.8359375, + "learning_rate": 7.285806451612904e-05, + "loss": 0.1432, + "step": 17329 + }, + { + "epoch": 0.27728, + "grad_norm": 0.76171875, + "learning_rate": 7.285645161290324e-05, + "loss": 0.18, + "step": 17330 + }, + { + "epoch": 0.277296, + "grad_norm": 0.71484375, + "learning_rate": 7.285483870967742e-05, + "loss": 0.2083, + "step": 17331 + }, + { + "epoch": 0.277312, + "grad_norm": 0.8984375, + "learning_rate": 7.285322580645162e-05, + "loss": 0.1916, + "step": 17332 + }, + { + "epoch": 0.277328, + "grad_norm": 0.97265625, + "learning_rate": 7.285161290322581e-05, + "loss": 0.1671, + "step": 17333 + }, + { + "epoch": 0.277344, + "grad_norm": 0.81640625, + "learning_rate": 7.285000000000001e-05, + "loss": 0.1993, + "step": 17334 + }, + { + "epoch": 0.27736, + "grad_norm": 0.9609375, + "learning_rate": 7.28483870967742e-05, + "loss": 0.1874, + "step": 17335 + }, + { + "epoch": 0.277376, + "grad_norm": 1.0078125, + "learning_rate": 7.28467741935484e-05, + "loss": 0.186, + "step": 17336 + }, + { + "epoch": 0.277392, + "grad_norm": 0.73828125, + "learning_rate": 7.284516129032258e-05, + "loss": 0.1669, + "step": 17337 + }, + { + "epoch": 0.277408, + "grad_norm": 1.25, + "learning_rate": 7.284354838709678e-05, + "loss": 0.224, + "step": 17338 + }, + { + "epoch": 0.277424, + "grad_norm": 0.78125, + "learning_rate": 7.284193548387097e-05, + "loss": 0.1772, + "step": 17339 + }, + { + "epoch": 0.27744, + "grad_norm": 0.8984375, + "learning_rate": 7.284032258064517e-05, + "loss": 0.1758, + "step": 17340 + }, + { + "epoch": 0.277456, + "grad_norm": 0.6796875, + "learning_rate": 7.283870967741937e-05, + "loss": 0.1648, + "step": 17341 + }, + { + "epoch": 0.277472, + "grad_norm": 1.0234375, + "learning_rate": 7.283709677419355e-05, + "loss": 0.1995, + "step": 17342 + }, + { + "epoch": 0.277488, + "grad_norm": 0.828125, + "learning_rate": 7.283548387096775e-05, + "loss": 0.1681, + "step": 17343 + }, + { + "epoch": 0.277504, + "grad_norm": 0.8515625, + "learning_rate": 7.283387096774194e-05, + "loss": 0.1694, + "step": 17344 + }, + { + "epoch": 0.27752, + "grad_norm": 0.9609375, + "learning_rate": 7.283225806451614e-05, + "loss": 0.2197, + "step": 17345 + }, + { + "epoch": 0.277536, + "grad_norm": 1.1015625, + "learning_rate": 7.283064516129032e-05, + "loss": 0.1929, + "step": 17346 + }, + { + "epoch": 0.277552, + "grad_norm": 0.83203125, + "learning_rate": 7.282903225806452e-05, + "loss": 0.181, + "step": 17347 + }, + { + "epoch": 0.277568, + "grad_norm": 0.5625, + "learning_rate": 7.282741935483871e-05, + "loss": 0.1527, + "step": 17348 + }, + { + "epoch": 0.277584, + "grad_norm": 1.0546875, + "learning_rate": 7.282580645161291e-05, + "loss": 0.1867, + "step": 17349 + }, + { + "epoch": 0.2776, + "grad_norm": 1.0078125, + "learning_rate": 7.28241935483871e-05, + "loss": 0.1682, + "step": 17350 + }, + { + "epoch": 0.277616, + "grad_norm": 0.78515625, + "learning_rate": 7.28225806451613e-05, + "loss": 0.1644, + "step": 17351 + }, + { + "epoch": 0.277632, + "grad_norm": 0.984375, + "learning_rate": 7.282096774193548e-05, + "loss": 0.2049, + "step": 17352 + }, + { + "epoch": 0.277648, + "grad_norm": 0.56640625, + "learning_rate": 7.281935483870968e-05, + "loss": 0.1546, + "step": 17353 + }, + { + "epoch": 0.277664, + "grad_norm": 0.6953125, + "learning_rate": 7.281774193548388e-05, + "loss": 0.1623, + "step": 17354 + }, + { + "epoch": 0.27768, + "grad_norm": 1.078125, + "learning_rate": 7.281612903225807e-05, + "loss": 0.156, + "step": 17355 + }, + { + "epoch": 0.277696, + "grad_norm": 0.80078125, + "learning_rate": 7.281451612903227e-05, + "loss": 0.1511, + "step": 17356 + }, + { + "epoch": 0.277712, + "grad_norm": 0.76953125, + "learning_rate": 7.281290322580645e-05, + "loss": 0.1456, + "step": 17357 + }, + { + "epoch": 0.277728, + "grad_norm": 0.69921875, + "learning_rate": 7.281129032258065e-05, + "loss": 0.125, + "step": 17358 + }, + { + "epoch": 0.277744, + "grad_norm": 0.734375, + "learning_rate": 7.280967741935484e-05, + "loss": 0.1555, + "step": 17359 + }, + { + "epoch": 0.27776, + "grad_norm": 0.76171875, + "learning_rate": 7.280806451612904e-05, + "loss": 0.168, + "step": 17360 + }, + { + "epoch": 0.277776, + "grad_norm": 0.57421875, + "learning_rate": 7.280645161290322e-05, + "loss": 0.1636, + "step": 17361 + }, + { + "epoch": 0.277792, + "grad_norm": 0.82421875, + "learning_rate": 7.280483870967742e-05, + "loss": 0.1516, + "step": 17362 + }, + { + "epoch": 0.277808, + "grad_norm": 0.546875, + "learning_rate": 7.280322580645161e-05, + "loss": 0.1774, + "step": 17363 + }, + { + "epoch": 0.277824, + "grad_norm": 0.8125, + "learning_rate": 7.280161290322581e-05, + "loss": 0.1647, + "step": 17364 + }, + { + "epoch": 0.27784, + "grad_norm": 0.74609375, + "learning_rate": 7.280000000000001e-05, + "loss": 0.1777, + "step": 17365 + }, + { + "epoch": 0.277856, + "grad_norm": 0.6953125, + "learning_rate": 7.279838709677421e-05, + "loss": 0.1557, + "step": 17366 + }, + { + "epoch": 0.277872, + "grad_norm": 0.8828125, + "learning_rate": 7.27967741935484e-05, + "loss": 0.1904, + "step": 17367 + }, + { + "epoch": 0.277888, + "grad_norm": 0.8046875, + "learning_rate": 7.279516129032259e-05, + "loss": 0.1538, + "step": 17368 + }, + { + "epoch": 0.277904, + "grad_norm": 0.7421875, + "learning_rate": 7.279354838709678e-05, + "loss": 0.1829, + "step": 17369 + }, + { + "epoch": 0.27792, + "grad_norm": 1.0, + "learning_rate": 7.279193548387098e-05, + "loss": 0.1609, + "step": 17370 + }, + { + "epoch": 0.277936, + "grad_norm": 0.62109375, + "learning_rate": 7.279032258064516e-05, + "loss": 0.1843, + "step": 17371 + }, + { + "epoch": 0.277952, + "grad_norm": 0.6875, + "learning_rate": 7.278870967741935e-05, + "loss": 0.1656, + "step": 17372 + }, + { + "epoch": 0.277968, + "grad_norm": 1.1171875, + "learning_rate": 7.278709677419355e-05, + "loss": 0.1952, + "step": 17373 + }, + { + "epoch": 0.277984, + "grad_norm": 0.76953125, + "learning_rate": 7.278548387096774e-05, + "loss": 0.1804, + "step": 17374 + }, + { + "epoch": 0.278, + "grad_norm": 0.80078125, + "learning_rate": 7.278387096774194e-05, + "loss": 0.1812, + "step": 17375 + }, + { + "epoch": 0.278016, + "grad_norm": 0.65234375, + "learning_rate": 7.278225806451612e-05, + "loss": 0.1996, + "step": 17376 + }, + { + "epoch": 0.278032, + "grad_norm": 0.96875, + "learning_rate": 7.278064516129032e-05, + "loss": 0.2021, + "step": 17377 + }, + { + "epoch": 0.278048, + "grad_norm": 0.62890625, + "learning_rate": 7.277903225806452e-05, + "loss": 0.148, + "step": 17378 + }, + { + "epoch": 0.278064, + "grad_norm": 0.96484375, + "learning_rate": 7.277741935483872e-05, + "loss": 0.1784, + "step": 17379 + }, + { + "epoch": 0.27808, + "grad_norm": 0.95703125, + "learning_rate": 7.277580645161291e-05, + "loss": 0.1587, + "step": 17380 + }, + { + "epoch": 0.278096, + "grad_norm": 0.86328125, + "learning_rate": 7.277419354838711e-05, + "loss": 0.2162, + "step": 17381 + }, + { + "epoch": 0.278112, + "grad_norm": 1.296875, + "learning_rate": 7.277258064516129e-05, + "loss": 0.1846, + "step": 17382 + }, + { + "epoch": 0.278128, + "grad_norm": 1.1015625, + "learning_rate": 7.277096774193549e-05, + "loss": 0.1783, + "step": 17383 + }, + { + "epoch": 0.278144, + "grad_norm": 0.66015625, + "learning_rate": 7.276935483870968e-05, + "loss": 0.1538, + "step": 17384 + }, + { + "epoch": 0.27816, + "grad_norm": 0.6796875, + "learning_rate": 7.276774193548388e-05, + "loss": 0.1654, + "step": 17385 + }, + { + "epoch": 0.278176, + "grad_norm": 0.6875, + "learning_rate": 7.276612903225806e-05, + "loss": 0.1585, + "step": 17386 + }, + { + "epoch": 0.278192, + "grad_norm": 1.109375, + "learning_rate": 7.276451612903225e-05, + "loss": 0.1872, + "step": 17387 + }, + { + "epoch": 0.278208, + "grad_norm": 0.71484375, + "learning_rate": 7.276290322580645e-05, + "loss": 0.169, + "step": 17388 + }, + { + "epoch": 0.278224, + "grad_norm": 0.9453125, + "learning_rate": 7.276129032258065e-05, + "loss": 0.19, + "step": 17389 + }, + { + "epoch": 0.27824, + "grad_norm": 0.84765625, + "learning_rate": 7.275967741935485e-05, + "loss": 0.1791, + "step": 17390 + }, + { + "epoch": 0.278256, + "grad_norm": 0.98046875, + "learning_rate": 7.275806451612904e-05, + "loss": 0.1437, + "step": 17391 + }, + { + "epoch": 0.278272, + "grad_norm": 0.55078125, + "learning_rate": 7.275645161290324e-05, + "loss": 0.1644, + "step": 17392 + }, + { + "epoch": 0.278288, + "grad_norm": 0.875, + "learning_rate": 7.275483870967742e-05, + "loss": 0.1585, + "step": 17393 + }, + { + "epoch": 0.278304, + "grad_norm": 0.859375, + "learning_rate": 7.275322580645162e-05, + "loss": 0.1732, + "step": 17394 + }, + { + "epoch": 0.27832, + "grad_norm": 1.234375, + "learning_rate": 7.275161290322581e-05, + "loss": 0.1216, + "step": 17395 + }, + { + "epoch": 0.278336, + "grad_norm": 0.5078125, + "learning_rate": 7.275e-05, + "loss": 0.1398, + "step": 17396 + }, + { + "epoch": 0.278352, + "grad_norm": 0.7421875, + "learning_rate": 7.274838709677419e-05, + "loss": 0.1201, + "step": 17397 + }, + { + "epoch": 0.278368, + "grad_norm": 0.76953125, + "learning_rate": 7.274677419354839e-05, + "loss": 0.1759, + "step": 17398 + }, + { + "epoch": 0.278384, + "grad_norm": 0.87890625, + "learning_rate": 7.274516129032258e-05, + "loss": 0.1726, + "step": 17399 + }, + { + "epoch": 0.2784, + "grad_norm": 0.9609375, + "learning_rate": 7.274354838709678e-05, + "loss": 0.1436, + "step": 17400 + }, + { + "epoch": 0.278416, + "grad_norm": 1.1875, + "learning_rate": 7.274193548387098e-05, + "loss": 0.2321, + "step": 17401 + }, + { + "epoch": 0.278432, + "grad_norm": 1.0078125, + "learning_rate": 7.274032258064516e-05, + "loss": 0.1969, + "step": 17402 + }, + { + "epoch": 0.278448, + "grad_norm": 0.55078125, + "learning_rate": 7.273870967741936e-05, + "loss": 0.1332, + "step": 17403 + }, + { + "epoch": 0.278464, + "grad_norm": 1.21875, + "learning_rate": 7.273709677419355e-05, + "loss": 0.2605, + "step": 17404 + }, + { + "epoch": 0.27848, + "grad_norm": 1.15625, + "learning_rate": 7.273548387096775e-05, + "loss": 0.1469, + "step": 17405 + }, + { + "epoch": 0.278496, + "grad_norm": 0.9453125, + "learning_rate": 7.273387096774194e-05, + "loss": 0.1831, + "step": 17406 + }, + { + "epoch": 0.278512, + "grad_norm": 0.59765625, + "learning_rate": 7.273225806451613e-05, + "loss": 0.1835, + "step": 17407 + }, + { + "epoch": 0.278528, + "grad_norm": 0.67578125, + "learning_rate": 7.273064516129032e-05, + "loss": 0.1623, + "step": 17408 + }, + { + "epoch": 0.278544, + "grad_norm": 0.5546875, + "learning_rate": 7.272903225806452e-05, + "loss": 0.1572, + "step": 17409 + }, + { + "epoch": 0.27856, + "grad_norm": 0.58203125, + "learning_rate": 7.27274193548387e-05, + "loss": 0.1359, + "step": 17410 + }, + { + "epoch": 0.278576, + "grad_norm": 0.83203125, + "learning_rate": 7.27258064516129e-05, + "loss": 0.0962, + "step": 17411 + }, + { + "epoch": 0.278592, + "grad_norm": 1.2734375, + "learning_rate": 7.272419354838709e-05, + "loss": 0.2104, + "step": 17412 + }, + { + "epoch": 0.278608, + "grad_norm": 0.56640625, + "learning_rate": 7.272258064516129e-05, + "loss": 0.1666, + "step": 17413 + }, + { + "epoch": 0.278624, + "grad_norm": 0.546875, + "learning_rate": 7.272096774193549e-05, + "loss": 0.1551, + "step": 17414 + }, + { + "epoch": 0.27864, + "grad_norm": 0.76171875, + "learning_rate": 7.271935483870969e-05, + "loss": 0.2027, + "step": 17415 + }, + { + "epoch": 0.278656, + "grad_norm": 1.015625, + "learning_rate": 7.271774193548388e-05, + "loss": 0.2039, + "step": 17416 + }, + { + "epoch": 0.278672, + "grad_norm": 1.1171875, + "learning_rate": 7.271612903225808e-05, + "loss": 0.1594, + "step": 17417 + }, + { + "epoch": 0.278688, + "grad_norm": 0.5546875, + "learning_rate": 7.271451612903226e-05, + "loss": 0.1684, + "step": 17418 + }, + { + "epoch": 0.278704, + "grad_norm": 0.7265625, + "learning_rate": 7.271290322580645e-05, + "loss": 0.1913, + "step": 17419 + }, + { + "epoch": 0.27872, + "grad_norm": 0.94140625, + "learning_rate": 7.271129032258065e-05, + "loss": 0.1487, + "step": 17420 + }, + { + "epoch": 0.278736, + "grad_norm": 0.8046875, + "learning_rate": 7.270967741935483e-05, + "loss": 0.1905, + "step": 17421 + }, + { + "epoch": 0.278752, + "grad_norm": 0.74609375, + "learning_rate": 7.270806451612903e-05, + "loss": 0.1402, + "step": 17422 + }, + { + "epoch": 0.278768, + "grad_norm": 1.3046875, + "learning_rate": 7.270645161290322e-05, + "loss": 0.23, + "step": 17423 + }, + { + "epoch": 0.278784, + "grad_norm": 0.62109375, + "learning_rate": 7.270483870967742e-05, + "loss": 0.1699, + "step": 17424 + }, + { + "epoch": 0.2788, + "grad_norm": 1.046875, + "learning_rate": 7.270322580645162e-05, + "loss": 0.1664, + "step": 17425 + }, + { + "epoch": 0.278816, + "grad_norm": 0.69921875, + "learning_rate": 7.270161290322582e-05, + "loss": 0.1416, + "step": 17426 + }, + { + "epoch": 0.278832, + "grad_norm": 0.8203125, + "learning_rate": 7.27e-05, + "loss": 0.1409, + "step": 17427 + }, + { + "epoch": 0.278848, + "grad_norm": 1.171875, + "learning_rate": 7.26983870967742e-05, + "loss": 0.1931, + "step": 17428 + }, + { + "epoch": 0.278864, + "grad_norm": 0.75, + "learning_rate": 7.269677419354839e-05, + "loss": 0.136, + "step": 17429 + }, + { + "epoch": 0.27888, + "grad_norm": 0.71484375, + "learning_rate": 7.269516129032259e-05, + "loss": 0.1893, + "step": 17430 + }, + { + "epoch": 0.278896, + "grad_norm": 0.66015625, + "learning_rate": 7.269354838709678e-05, + "loss": 0.1647, + "step": 17431 + }, + { + "epoch": 0.278912, + "grad_norm": 0.90625, + "learning_rate": 7.269193548387098e-05, + "loss": 0.154, + "step": 17432 + }, + { + "epoch": 0.278928, + "grad_norm": 0.69140625, + "learning_rate": 7.269032258064516e-05, + "loss": 0.1726, + "step": 17433 + }, + { + "epoch": 0.278944, + "grad_norm": 0.7109375, + "learning_rate": 7.268870967741935e-05, + "loss": 0.1871, + "step": 17434 + }, + { + "epoch": 0.27896, + "grad_norm": 1.5234375, + "learning_rate": 7.268709677419355e-05, + "loss": 0.1508, + "step": 17435 + }, + { + "epoch": 0.278976, + "grad_norm": 0.734375, + "learning_rate": 7.268548387096775e-05, + "loss": 0.1409, + "step": 17436 + }, + { + "epoch": 0.278992, + "grad_norm": 0.87109375, + "learning_rate": 7.268387096774193e-05, + "loss": 0.1716, + "step": 17437 + }, + { + "epoch": 0.279008, + "grad_norm": 0.99609375, + "learning_rate": 7.268225806451613e-05, + "loss": 0.1404, + "step": 17438 + }, + { + "epoch": 0.279024, + "grad_norm": 0.7578125, + "learning_rate": 7.268064516129033e-05, + "loss": 0.1743, + "step": 17439 + }, + { + "epoch": 0.27904, + "grad_norm": 0.6171875, + "learning_rate": 7.267903225806452e-05, + "loss": 0.1615, + "step": 17440 + }, + { + "epoch": 0.279056, + "grad_norm": 0.73828125, + "learning_rate": 7.267741935483872e-05, + "loss": 0.1463, + "step": 17441 + }, + { + "epoch": 0.279072, + "grad_norm": 1.1328125, + "learning_rate": 7.26758064516129e-05, + "loss": 0.1652, + "step": 17442 + }, + { + "epoch": 0.279088, + "grad_norm": 1.15625, + "learning_rate": 7.26741935483871e-05, + "loss": 0.1701, + "step": 17443 + }, + { + "epoch": 0.279104, + "grad_norm": 0.59375, + "learning_rate": 7.267258064516129e-05, + "loss": 0.1685, + "step": 17444 + }, + { + "epoch": 0.27912, + "grad_norm": 0.96484375, + "learning_rate": 7.267096774193549e-05, + "loss": 0.1659, + "step": 17445 + }, + { + "epoch": 0.279136, + "grad_norm": 0.54296875, + "learning_rate": 7.266935483870968e-05, + "loss": 0.1534, + "step": 17446 + }, + { + "epoch": 0.279152, + "grad_norm": 0.6015625, + "learning_rate": 7.266774193548388e-05, + "loss": 0.1466, + "step": 17447 + }, + { + "epoch": 0.279168, + "grad_norm": 1.09375, + "learning_rate": 7.266612903225806e-05, + "loss": 0.1387, + "step": 17448 + }, + { + "epoch": 0.279184, + "grad_norm": 1.0546875, + "learning_rate": 7.266451612903226e-05, + "loss": 0.1782, + "step": 17449 + }, + { + "epoch": 0.2792, + "grad_norm": 0.734375, + "learning_rate": 7.266290322580646e-05, + "loss": 0.1683, + "step": 17450 + }, + { + "epoch": 0.279216, + "grad_norm": 1.21875, + "learning_rate": 7.266129032258065e-05, + "loss": 0.1894, + "step": 17451 + }, + { + "epoch": 0.279232, + "grad_norm": 0.82421875, + "learning_rate": 7.265967741935485e-05, + "loss": 0.1921, + "step": 17452 + }, + { + "epoch": 0.279248, + "grad_norm": 1.1328125, + "learning_rate": 7.265806451612903e-05, + "loss": 0.1414, + "step": 17453 + }, + { + "epoch": 0.279264, + "grad_norm": 1.7734375, + "learning_rate": 7.265645161290323e-05, + "loss": 0.1969, + "step": 17454 + }, + { + "epoch": 0.27928, + "grad_norm": 1.1015625, + "learning_rate": 7.265483870967742e-05, + "loss": 0.164, + "step": 17455 + }, + { + "epoch": 0.279296, + "grad_norm": 0.76171875, + "learning_rate": 7.265322580645162e-05, + "loss": 0.1534, + "step": 17456 + }, + { + "epoch": 0.279312, + "grad_norm": 0.65625, + "learning_rate": 7.26516129032258e-05, + "loss": 0.1365, + "step": 17457 + }, + { + "epoch": 0.279328, + "grad_norm": 0.75390625, + "learning_rate": 7.265e-05, + "loss": 0.161, + "step": 17458 + }, + { + "epoch": 0.279344, + "grad_norm": 0.625, + "learning_rate": 7.264838709677419e-05, + "loss": 0.1527, + "step": 17459 + }, + { + "epoch": 0.27936, + "grad_norm": 0.55859375, + "learning_rate": 7.264677419354839e-05, + "loss": 0.1462, + "step": 17460 + }, + { + "epoch": 0.279376, + "grad_norm": 0.69921875, + "learning_rate": 7.264516129032259e-05, + "loss": 0.1756, + "step": 17461 + }, + { + "epoch": 0.279392, + "grad_norm": 0.83203125, + "learning_rate": 7.264354838709679e-05, + "loss": 0.1848, + "step": 17462 + }, + { + "epoch": 0.279408, + "grad_norm": 0.72265625, + "learning_rate": 7.264193548387098e-05, + "loss": 0.1915, + "step": 17463 + }, + { + "epoch": 0.279424, + "grad_norm": 1.046875, + "learning_rate": 7.264032258064516e-05, + "loss": 0.169, + "step": 17464 + }, + { + "epoch": 0.27944, + "grad_norm": 1.21875, + "learning_rate": 7.263870967741936e-05, + "loss": 0.1882, + "step": 17465 + }, + { + "epoch": 0.279456, + "grad_norm": 0.640625, + "learning_rate": 7.263709677419355e-05, + "loss": 0.1409, + "step": 17466 + }, + { + "epoch": 0.279472, + "grad_norm": 0.9296875, + "learning_rate": 7.263548387096775e-05, + "loss": 0.1854, + "step": 17467 + }, + { + "epoch": 0.279488, + "grad_norm": 1.0625, + "learning_rate": 7.263387096774193e-05, + "loss": 0.1706, + "step": 17468 + }, + { + "epoch": 0.279504, + "grad_norm": 1.0, + "learning_rate": 7.263225806451613e-05, + "loss": 0.1582, + "step": 17469 + }, + { + "epoch": 0.27952, + "grad_norm": 0.66015625, + "learning_rate": 7.263064516129032e-05, + "loss": 0.13, + "step": 17470 + }, + { + "epoch": 0.279536, + "grad_norm": 0.84375, + "learning_rate": 7.262903225806452e-05, + "loss": 0.1621, + "step": 17471 + }, + { + "epoch": 0.279552, + "grad_norm": 0.5234375, + "learning_rate": 7.26274193548387e-05, + "loss": 0.131, + "step": 17472 + }, + { + "epoch": 0.279568, + "grad_norm": 1.0625, + "learning_rate": 7.26258064516129e-05, + "loss": 0.1892, + "step": 17473 + }, + { + "epoch": 0.279584, + "grad_norm": 1.0625, + "learning_rate": 7.26241935483871e-05, + "loss": 0.2122, + "step": 17474 + }, + { + "epoch": 0.2796, + "grad_norm": 1.2421875, + "learning_rate": 7.26225806451613e-05, + "loss": 0.1742, + "step": 17475 + }, + { + "epoch": 0.279616, + "grad_norm": 0.97265625, + "learning_rate": 7.262096774193549e-05, + "loss": 0.1962, + "step": 17476 + }, + { + "epoch": 0.279632, + "grad_norm": 1.1015625, + "learning_rate": 7.261935483870969e-05, + "loss": 0.2189, + "step": 17477 + }, + { + "epoch": 0.279648, + "grad_norm": 1.3671875, + "learning_rate": 7.261774193548387e-05, + "loss": 0.162, + "step": 17478 + }, + { + "epoch": 0.279664, + "grad_norm": 0.9765625, + "learning_rate": 7.261612903225807e-05, + "loss": 0.1509, + "step": 17479 + }, + { + "epoch": 0.27968, + "grad_norm": 0.73828125, + "learning_rate": 7.261451612903226e-05, + "loss": 0.2175, + "step": 17480 + }, + { + "epoch": 0.279696, + "grad_norm": 0.828125, + "learning_rate": 7.261290322580645e-05, + "loss": 0.175, + "step": 17481 + }, + { + "epoch": 0.279712, + "grad_norm": 0.6953125, + "learning_rate": 7.261129032258065e-05, + "loss": 0.1608, + "step": 17482 + }, + { + "epoch": 0.279728, + "grad_norm": 0.58203125, + "learning_rate": 7.260967741935483e-05, + "loss": 0.1615, + "step": 17483 + }, + { + "epoch": 0.279744, + "grad_norm": 1.0546875, + "learning_rate": 7.260806451612903e-05, + "loss": 0.1651, + "step": 17484 + }, + { + "epoch": 0.27976, + "grad_norm": 0.69921875, + "learning_rate": 7.260645161290323e-05, + "loss": 0.1702, + "step": 17485 + }, + { + "epoch": 0.279776, + "grad_norm": 0.6015625, + "learning_rate": 7.260483870967743e-05, + "loss": 0.1643, + "step": 17486 + }, + { + "epoch": 0.279792, + "grad_norm": 1.0625, + "learning_rate": 7.260322580645162e-05, + "loss": 0.1547, + "step": 17487 + }, + { + "epoch": 0.279808, + "grad_norm": 1.0390625, + "learning_rate": 7.260161290322582e-05, + "loss": 0.1497, + "step": 17488 + }, + { + "epoch": 0.279824, + "grad_norm": 1.28125, + "learning_rate": 7.26e-05, + "loss": 0.2584, + "step": 17489 + }, + { + "epoch": 0.27984, + "grad_norm": 1.046875, + "learning_rate": 7.25983870967742e-05, + "loss": 0.2104, + "step": 17490 + }, + { + "epoch": 0.279856, + "grad_norm": 0.625, + "learning_rate": 7.259677419354839e-05, + "loss": 0.1807, + "step": 17491 + }, + { + "epoch": 0.279872, + "grad_norm": 0.8203125, + "learning_rate": 7.259516129032259e-05, + "loss": 0.1622, + "step": 17492 + }, + { + "epoch": 0.279888, + "grad_norm": 0.65234375, + "learning_rate": 7.259354838709677e-05, + "loss": 0.1355, + "step": 17493 + }, + { + "epoch": 0.279904, + "grad_norm": 0.58984375, + "learning_rate": 7.259193548387097e-05, + "loss": 0.1534, + "step": 17494 + }, + { + "epoch": 0.27992, + "grad_norm": 0.953125, + "learning_rate": 7.259032258064516e-05, + "loss": 0.2233, + "step": 17495 + }, + { + "epoch": 0.279936, + "grad_norm": 0.78515625, + "learning_rate": 7.258870967741936e-05, + "loss": 0.1672, + "step": 17496 + }, + { + "epoch": 0.279952, + "grad_norm": 1.0234375, + "learning_rate": 7.258709677419356e-05, + "loss": 0.1478, + "step": 17497 + }, + { + "epoch": 0.279968, + "grad_norm": 1.015625, + "learning_rate": 7.258548387096775e-05, + "loss": 0.1585, + "step": 17498 + }, + { + "epoch": 0.279984, + "grad_norm": 0.6328125, + "learning_rate": 7.258387096774194e-05, + "loss": 0.1606, + "step": 17499 + }, + { + "epoch": 0.28, + "grad_norm": 0.76171875, + "learning_rate": 7.258225806451613e-05, + "loss": 0.1421, + "step": 17500 + }, + { + "epoch": 0.280016, + "grad_norm": 0.7265625, + "learning_rate": 7.258064516129033e-05, + "loss": 0.1668, + "step": 17501 + }, + { + "epoch": 0.280032, + "grad_norm": 0.69140625, + "learning_rate": 7.257903225806452e-05, + "loss": 0.1744, + "step": 17502 + }, + { + "epoch": 0.280048, + "grad_norm": 0.70703125, + "learning_rate": 7.257741935483872e-05, + "loss": 0.1723, + "step": 17503 + }, + { + "epoch": 0.280064, + "grad_norm": 1.125, + "learning_rate": 7.25758064516129e-05, + "loss": 0.1866, + "step": 17504 + }, + { + "epoch": 0.28008, + "grad_norm": 1.1796875, + "learning_rate": 7.25741935483871e-05, + "loss": 0.1734, + "step": 17505 + }, + { + "epoch": 0.280096, + "grad_norm": 0.7578125, + "learning_rate": 7.257258064516129e-05, + "loss": 0.1622, + "step": 17506 + }, + { + "epoch": 0.280112, + "grad_norm": 0.68359375, + "learning_rate": 7.257096774193549e-05, + "loss": 0.134, + "step": 17507 + }, + { + "epoch": 0.280128, + "grad_norm": 0.80078125, + "learning_rate": 7.256935483870967e-05, + "loss": 0.195, + "step": 17508 + }, + { + "epoch": 0.280144, + "grad_norm": 0.79296875, + "learning_rate": 7.256774193548387e-05, + "loss": 0.1669, + "step": 17509 + }, + { + "epoch": 0.28016, + "grad_norm": 0.703125, + "learning_rate": 7.256612903225807e-05, + "loss": 0.1866, + "step": 17510 + }, + { + "epoch": 0.280176, + "grad_norm": 1.3125, + "learning_rate": 7.256451612903226e-05, + "loss": 0.1365, + "step": 17511 + }, + { + "epoch": 0.280192, + "grad_norm": 0.84765625, + "learning_rate": 7.256290322580646e-05, + "loss": 0.1894, + "step": 17512 + }, + { + "epoch": 0.280208, + "grad_norm": 1.0, + "learning_rate": 7.256129032258064e-05, + "loss": 0.1936, + "step": 17513 + }, + { + "epoch": 0.280224, + "grad_norm": 0.74609375, + "learning_rate": 7.255967741935484e-05, + "loss": 0.1723, + "step": 17514 + }, + { + "epoch": 0.28024, + "grad_norm": 0.9453125, + "learning_rate": 7.255806451612903e-05, + "loss": 0.1502, + "step": 17515 + }, + { + "epoch": 0.280256, + "grad_norm": 0.75, + "learning_rate": 7.255645161290323e-05, + "loss": 0.161, + "step": 17516 + }, + { + "epoch": 0.280272, + "grad_norm": 0.8671875, + "learning_rate": 7.255483870967742e-05, + "loss": 0.1653, + "step": 17517 + }, + { + "epoch": 0.280288, + "grad_norm": 0.72265625, + "learning_rate": 7.255322580645162e-05, + "loss": 0.1894, + "step": 17518 + }, + { + "epoch": 0.280304, + "grad_norm": 1.0546875, + "learning_rate": 7.25516129032258e-05, + "loss": 0.1664, + "step": 17519 + }, + { + "epoch": 0.28032, + "grad_norm": 1.0546875, + "learning_rate": 7.255e-05, + "loss": 0.1439, + "step": 17520 + }, + { + "epoch": 0.280336, + "grad_norm": 0.7578125, + "learning_rate": 7.25483870967742e-05, + "loss": 0.1928, + "step": 17521 + }, + { + "epoch": 0.280352, + "grad_norm": 1.1171875, + "learning_rate": 7.25467741935484e-05, + "loss": 0.1991, + "step": 17522 + }, + { + "epoch": 0.280368, + "grad_norm": 0.6953125, + "learning_rate": 7.254516129032259e-05, + "loss": 0.1816, + "step": 17523 + }, + { + "epoch": 0.280384, + "grad_norm": 0.8359375, + "learning_rate": 7.254354838709679e-05, + "loss": 0.172, + "step": 17524 + }, + { + "epoch": 0.2804, + "grad_norm": 1.265625, + "learning_rate": 7.254193548387097e-05, + "loss": 0.16, + "step": 17525 + }, + { + "epoch": 0.280416, + "grad_norm": 0.9140625, + "learning_rate": 7.254032258064517e-05, + "loss": 0.1711, + "step": 17526 + }, + { + "epoch": 0.280432, + "grad_norm": 0.953125, + "learning_rate": 7.253870967741936e-05, + "loss": 0.1825, + "step": 17527 + }, + { + "epoch": 0.280448, + "grad_norm": 1.328125, + "learning_rate": 7.253709677419354e-05, + "loss": 0.2051, + "step": 17528 + }, + { + "epoch": 0.280464, + "grad_norm": 0.6328125, + "learning_rate": 7.253548387096774e-05, + "loss": 0.1589, + "step": 17529 + }, + { + "epoch": 0.28048, + "grad_norm": 0.54296875, + "learning_rate": 7.253387096774193e-05, + "loss": 0.1676, + "step": 17530 + }, + { + "epoch": 0.280496, + "grad_norm": 0.9765625, + "learning_rate": 7.253225806451613e-05, + "loss": 0.1875, + "step": 17531 + }, + { + "epoch": 0.280512, + "grad_norm": 1.203125, + "learning_rate": 7.253064516129032e-05, + "loss": 0.1577, + "step": 17532 + }, + { + "epoch": 0.280528, + "grad_norm": 0.859375, + "learning_rate": 7.252903225806452e-05, + "loss": 0.2189, + "step": 17533 + }, + { + "epoch": 0.280544, + "grad_norm": 0.9375, + "learning_rate": 7.252741935483872e-05, + "loss": 0.1664, + "step": 17534 + }, + { + "epoch": 0.28056, + "grad_norm": 0.59375, + "learning_rate": 7.252580645161291e-05, + "loss": 0.1797, + "step": 17535 + }, + { + "epoch": 0.280576, + "grad_norm": 0.84375, + "learning_rate": 7.25241935483871e-05, + "loss": 0.1935, + "step": 17536 + }, + { + "epoch": 0.280592, + "grad_norm": 0.80078125, + "learning_rate": 7.25225806451613e-05, + "loss": 0.1637, + "step": 17537 + }, + { + "epoch": 0.280608, + "grad_norm": 0.90625, + "learning_rate": 7.252096774193549e-05, + "loss": 0.1996, + "step": 17538 + }, + { + "epoch": 0.280624, + "grad_norm": 0.5625, + "learning_rate": 7.251935483870969e-05, + "loss": 0.1821, + "step": 17539 + }, + { + "epoch": 0.28064, + "grad_norm": 0.95703125, + "learning_rate": 7.251774193548387e-05, + "loss": 0.1783, + "step": 17540 + }, + { + "epoch": 0.280656, + "grad_norm": 1.1015625, + "learning_rate": 7.251612903225807e-05, + "loss": 0.2041, + "step": 17541 + }, + { + "epoch": 0.280672, + "grad_norm": 0.91796875, + "learning_rate": 7.251451612903226e-05, + "loss": 0.1678, + "step": 17542 + }, + { + "epoch": 0.280688, + "grad_norm": 1.046875, + "learning_rate": 7.251290322580644e-05, + "loss": 0.1892, + "step": 17543 + }, + { + "epoch": 0.280704, + "grad_norm": 0.7734375, + "learning_rate": 7.251129032258064e-05, + "loss": 0.1827, + "step": 17544 + }, + { + "epoch": 0.28072, + "grad_norm": 0.73046875, + "learning_rate": 7.250967741935484e-05, + "loss": 0.1793, + "step": 17545 + }, + { + "epoch": 0.280736, + "grad_norm": 0.75, + "learning_rate": 7.250806451612904e-05, + "loss": 0.1639, + "step": 17546 + }, + { + "epoch": 0.280752, + "grad_norm": 0.734375, + "learning_rate": 7.250645161290323e-05, + "loss": 0.1516, + "step": 17547 + }, + { + "epoch": 0.280768, + "grad_norm": 0.58203125, + "learning_rate": 7.250483870967743e-05, + "loss": 0.1706, + "step": 17548 + }, + { + "epoch": 0.280784, + "grad_norm": 1.0625, + "learning_rate": 7.250322580645161e-05, + "loss": 0.1677, + "step": 17549 + }, + { + "epoch": 0.2808, + "grad_norm": 0.60546875, + "learning_rate": 7.250161290322581e-05, + "loss": 0.1685, + "step": 17550 + }, + { + "epoch": 0.280816, + "grad_norm": 0.8515625, + "learning_rate": 7.25e-05, + "loss": 0.1815, + "step": 17551 + }, + { + "epoch": 0.280832, + "grad_norm": 0.63671875, + "learning_rate": 7.24983870967742e-05, + "loss": 0.1335, + "step": 17552 + }, + { + "epoch": 0.280848, + "grad_norm": 0.82421875, + "learning_rate": 7.249677419354839e-05, + "loss": 0.1715, + "step": 17553 + }, + { + "epoch": 0.280864, + "grad_norm": 0.81640625, + "learning_rate": 7.249516129032259e-05, + "loss": 0.2009, + "step": 17554 + }, + { + "epoch": 0.28088, + "grad_norm": 0.84375, + "learning_rate": 7.249354838709677e-05, + "loss": 0.1351, + "step": 17555 + }, + { + "epoch": 0.280896, + "grad_norm": 0.8203125, + "learning_rate": 7.249193548387097e-05, + "loss": 0.1589, + "step": 17556 + }, + { + "epoch": 0.280912, + "grad_norm": 0.70703125, + "learning_rate": 7.249032258064517e-05, + "loss": 0.1798, + "step": 17557 + }, + { + "epoch": 0.280928, + "grad_norm": 1.109375, + "learning_rate": 7.248870967741936e-05, + "loss": 0.171, + "step": 17558 + }, + { + "epoch": 0.280944, + "grad_norm": 0.5703125, + "learning_rate": 7.248709677419356e-05, + "loss": 0.1477, + "step": 17559 + }, + { + "epoch": 0.28096, + "grad_norm": 0.62890625, + "learning_rate": 7.248548387096774e-05, + "loss": 0.1446, + "step": 17560 + }, + { + "epoch": 0.280976, + "grad_norm": 0.86328125, + "learning_rate": 7.248387096774194e-05, + "loss": 0.1883, + "step": 17561 + }, + { + "epoch": 0.280992, + "grad_norm": 0.455078125, + "learning_rate": 7.248225806451613e-05, + "loss": 0.1437, + "step": 17562 + }, + { + "epoch": 0.281008, + "grad_norm": 0.93359375, + "learning_rate": 7.248064516129033e-05, + "loss": 0.2001, + "step": 17563 + }, + { + "epoch": 0.281024, + "grad_norm": 0.6953125, + "learning_rate": 7.247903225806451e-05, + "loss": 0.1545, + "step": 17564 + }, + { + "epoch": 0.28104, + "grad_norm": 0.6953125, + "learning_rate": 7.247741935483871e-05, + "loss": 0.1611, + "step": 17565 + }, + { + "epoch": 0.281056, + "grad_norm": 0.953125, + "learning_rate": 7.24758064516129e-05, + "loss": 0.1327, + "step": 17566 + }, + { + "epoch": 0.281072, + "grad_norm": 0.58984375, + "learning_rate": 7.24741935483871e-05, + "loss": 0.1615, + "step": 17567 + }, + { + "epoch": 0.281088, + "grad_norm": 0.6640625, + "learning_rate": 7.247258064516129e-05, + "loss": 0.1538, + "step": 17568 + }, + { + "epoch": 0.281104, + "grad_norm": 1.09375, + "learning_rate": 7.247096774193549e-05, + "loss": 0.2148, + "step": 17569 + }, + { + "epoch": 0.28112, + "grad_norm": 0.74609375, + "learning_rate": 7.246935483870968e-05, + "loss": 0.1522, + "step": 17570 + }, + { + "epoch": 0.281136, + "grad_norm": 0.80859375, + "learning_rate": 7.246774193548388e-05, + "loss": 0.1674, + "step": 17571 + }, + { + "epoch": 0.281152, + "grad_norm": 0.671875, + "learning_rate": 7.246612903225807e-05, + "loss": 0.1243, + "step": 17572 + }, + { + "epoch": 0.281168, + "grad_norm": 0.75390625, + "learning_rate": 7.246451612903226e-05, + "loss": 0.1715, + "step": 17573 + }, + { + "epoch": 0.281184, + "grad_norm": 0.72265625, + "learning_rate": 7.246290322580646e-05, + "loss": 0.17, + "step": 17574 + }, + { + "epoch": 0.2812, + "grad_norm": 1.2578125, + "learning_rate": 7.246129032258064e-05, + "loss": 0.1779, + "step": 17575 + }, + { + "epoch": 0.281216, + "grad_norm": 0.59765625, + "learning_rate": 7.245967741935484e-05, + "loss": 0.1507, + "step": 17576 + }, + { + "epoch": 0.281232, + "grad_norm": 1.40625, + "learning_rate": 7.245806451612903e-05, + "loss": 0.184, + "step": 17577 + }, + { + "epoch": 0.281248, + "grad_norm": 0.7421875, + "learning_rate": 7.245645161290323e-05, + "loss": 0.1703, + "step": 17578 + }, + { + "epoch": 0.281264, + "grad_norm": 0.69921875, + "learning_rate": 7.245483870967741e-05, + "loss": 0.1441, + "step": 17579 + }, + { + "epoch": 0.28128, + "grad_norm": 0.75, + "learning_rate": 7.245322580645161e-05, + "loss": 0.1835, + "step": 17580 + }, + { + "epoch": 0.281296, + "grad_norm": 0.69140625, + "learning_rate": 7.245161290322581e-05, + "loss": 0.1482, + "step": 17581 + }, + { + "epoch": 0.281312, + "grad_norm": 0.7890625, + "learning_rate": 7.245000000000001e-05, + "loss": 0.162, + "step": 17582 + }, + { + "epoch": 0.281328, + "grad_norm": 0.9296875, + "learning_rate": 7.24483870967742e-05, + "loss": 0.1979, + "step": 17583 + }, + { + "epoch": 0.281344, + "grad_norm": 1.203125, + "learning_rate": 7.24467741935484e-05, + "loss": 0.1825, + "step": 17584 + }, + { + "epoch": 0.28136, + "grad_norm": 0.953125, + "learning_rate": 7.244516129032258e-05, + "loss": 0.1954, + "step": 17585 + }, + { + "epoch": 0.281376, + "grad_norm": 0.78125, + "learning_rate": 7.244354838709678e-05, + "loss": 0.1896, + "step": 17586 + }, + { + "epoch": 0.281392, + "grad_norm": 0.90234375, + "learning_rate": 7.244193548387097e-05, + "loss": 0.1278, + "step": 17587 + }, + { + "epoch": 0.281408, + "grad_norm": 0.69140625, + "learning_rate": 7.244032258064517e-05, + "loss": 0.153, + "step": 17588 + }, + { + "epoch": 0.281424, + "grad_norm": 0.828125, + "learning_rate": 7.243870967741936e-05, + "loss": 0.179, + "step": 17589 + }, + { + "epoch": 0.28144, + "grad_norm": 1.3515625, + "learning_rate": 7.243709677419354e-05, + "loss": 0.2282, + "step": 17590 + }, + { + "epoch": 0.281456, + "grad_norm": 0.58203125, + "learning_rate": 7.243548387096774e-05, + "loss": 0.1597, + "step": 17591 + }, + { + "epoch": 0.281472, + "grad_norm": 1.046875, + "learning_rate": 7.243387096774194e-05, + "loss": 0.1791, + "step": 17592 + }, + { + "epoch": 0.281488, + "grad_norm": 0.953125, + "learning_rate": 7.243225806451614e-05, + "loss": 0.2075, + "step": 17593 + }, + { + "epoch": 0.281504, + "grad_norm": 0.87890625, + "learning_rate": 7.243064516129033e-05, + "loss": 0.163, + "step": 17594 + }, + { + "epoch": 0.28152, + "grad_norm": 0.8046875, + "learning_rate": 7.242903225806453e-05, + "loss": 0.1231, + "step": 17595 + }, + { + "epoch": 0.281536, + "grad_norm": 0.6171875, + "learning_rate": 7.242741935483871e-05, + "loss": 0.1784, + "step": 17596 + }, + { + "epoch": 0.281552, + "grad_norm": 1.09375, + "learning_rate": 7.242580645161291e-05, + "loss": 0.225, + "step": 17597 + }, + { + "epoch": 0.281568, + "grad_norm": 0.9296875, + "learning_rate": 7.24241935483871e-05, + "loss": 0.1421, + "step": 17598 + }, + { + "epoch": 0.281584, + "grad_norm": 0.83984375, + "learning_rate": 7.24225806451613e-05, + "loss": 0.217, + "step": 17599 + }, + { + "epoch": 0.2816, + "grad_norm": 0.8828125, + "learning_rate": 7.242096774193548e-05, + "loss": 0.188, + "step": 17600 + }, + { + "epoch": 0.281616, + "grad_norm": 0.95703125, + "learning_rate": 7.241935483870968e-05, + "loss": 0.1632, + "step": 17601 + }, + { + "epoch": 0.281632, + "grad_norm": 0.83984375, + "learning_rate": 7.241774193548387e-05, + "loss": 0.2109, + "step": 17602 + }, + { + "epoch": 0.281648, + "grad_norm": 0.91796875, + "learning_rate": 7.241612903225807e-05, + "loss": 0.1541, + "step": 17603 + }, + { + "epoch": 0.281664, + "grad_norm": 0.7734375, + "learning_rate": 7.241451612903226e-05, + "loss": 0.1692, + "step": 17604 + }, + { + "epoch": 0.28168, + "grad_norm": 1.1171875, + "learning_rate": 7.241290322580646e-05, + "loss": 0.2225, + "step": 17605 + }, + { + "epoch": 0.281696, + "grad_norm": 0.65234375, + "learning_rate": 7.241129032258065e-05, + "loss": 0.1452, + "step": 17606 + }, + { + "epoch": 0.281712, + "grad_norm": 0.921875, + "learning_rate": 7.240967741935484e-05, + "loss": 0.1857, + "step": 17607 + }, + { + "epoch": 0.281728, + "grad_norm": 0.69140625, + "learning_rate": 7.240806451612904e-05, + "loss": 0.1676, + "step": 17608 + }, + { + "epoch": 0.281744, + "grad_norm": 0.81640625, + "learning_rate": 7.240645161290323e-05, + "loss": 0.2206, + "step": 17609 + }, + { + "epoch": 0.28176, + "grad_norm": 0.76953125, + "learning_rate": 7.240483870967743e-05, + "loss": 0.1561, + "step": 17610 + }, + { + "epoch": 0.281776, + "grad_norm": 0.93359375, + "learning_rate": 7.240322580645161e-05, + "loss": 0.1693, + "step": 17611 + }, + { + "epoch": 0.281792, + "grad_norm": 0.828125, + "learning_rate": 7.240161290322581e-05, + "loss": 0.1989, + "step": 17612 + }, + { + "epoch": 0.281808, + "grad_norm": 0.7265625, + "learning_rate": 7.24e-05, + "loss": 0.1621, + "step": 17613 + }, + { + "epoch": 0.281824, + "grad_norm": 0.6484375, + "learning_rate": 7.23983870967742e-05, + "loss": 0.2, + "step": 17614 + }, + { + "epoch": 0.28184, + "grad_norm": 0.8359375, + "learning_rate": 7.239677419354838e-05, + "loss": 0.1759, + "step": 17615 + }, + { + "epoch": 0.281856, + "grad_norm": 0.7265625, + "learning_rate": 7.239516129032258e-05, + "loss": 0.1574, + "step": 17616 + }, + { + "epoch": 0.281872, + "grad_norm": 1.1484375, + "learning_rate": 7.239354838709678e-05, + "loss": 0.2171, + "step": 17617 + }, + { + "epoch": 0.281888, + "grad_norm": 1.046875, + "learning_rate": 7.239193548387098e-05, + "loss": 0.16, + "step": 17618 + }, + { + "epoch": 0.281904, + "grad_norm": 0.953125, + "learning_rate": 7.239032258064517e-05, + "loss": 0.196, + "step": 17619 + }, + { + "epoch": 0.28192, + "grad_norm": 0.93359375, + "learning_rate": 7.238870967741935e-05, + "loss": 0.1617, + "step": 17620 + }, + { + "epoch": 0.281936, + "grad_norm": 1.1875, + "learning_rate": 7.238709677419355e-05, + "loss": 0.1843, + "step": 17621 + }, + { + "epoch": 0.281952, + "grad_norm": 0.859375, + "learning_rate": 7.238548387096774e-05, + "loss": 0.1637, + "step": 17622 + }, + { + "epoch": 0.281968, + "grad_norm": 1.1640625, + "learning_rate": 7.238387096774194e-05, + "loss": 0.2031, + "step": 17623 + }, + { + "epoch": 0.281984, + "grad_norm": 0.6875, + "learning_rate": 7.238225806451613e-05, + "loss": 0.1677, + "step": 17624 + }, + { + "epoch": 0.282, + "grad_norm": 0.96484375, + "learning_rate": 7.238064516129033e-05, + "loss": 0.1584, + "step": 17625 + }, + { + "epoch": 0.282016, + "grad_norm": 0.61328125, + "learning_rate": 7.237903225806451e-05, + "loss": 0.1763, + "step": 17626 + }, + { + "epoch": 0.282032, + "grad_norm": 1.1171875, + "learning_rate": 7.237741935483871e-05, + "loss": 0.1926, + "step": 17627 + }, + { + "epoch": 0.282048, + "grad_norm": 0.58984375, + "learning_rate": 7.23758064516129e-05, + "loss": 0.1859, + "step": 17628 + }, + { + "epoch": 0.282064, + "grad_norm": 0.9921875, + "learning_rate": 7.23741935483871e-05, + "loss": 0.1983, + "step": 17629 + }, + { + "epoch": 0.28208, + "grad_norm": 0.77734375, + "learning_rate": 7.23725806451613e-05, + "loss": 0.1801, + "step": 17630 + }, + { + "epoch": 0.282096, + "grad_norm": 0.671875, + "learning_rate": 7.23709677419355e-05, + "loss": 0.1513, + "step": 17631 + }, + { + "epoch": 0.282112, + "grad_norm": 1.25, + "learning_rate": 7.236935483870968e-05, + "loss": 0.1515, + "step": 17632 + }, + { + "epoch": 0.282128, + "grad_norm": 0.828125, + "learning_rate": 7.236774193548388e-05, + "loss": 0.2019, + "step": 17633 + }, + { + "epoch": 0.282144, + "grad_norm": 1.0234375, + "learning_rate": 7.236612903225807e-05, + "loss": 0.1728, + "step": 17634 + }, + { + "epoch": 0.28216, + "grad_norm": 0.71484375, + "learning_rate": 7.236451612903227e-05, + "loss": 0.1482, + "step": 17635 + }, + { + "epoch": 0.282176, + "grad_norm": 0.59375, + "learning_rate": 7.236290322580645e-05, + "loss": 0.1577, + "step": 17636 + }, + { + "epoch": 0.282192, + "grad_norm": 0.609375, + "learning_rate": 7.236129032258064e-05, + "loss": 0.1413, + "step": 17637 + }, + { + "epoch": 0.282208, + "grad_norm": 1.09375, + "learning_rate": 7.235967741935484e-05, + "loss": 0.1738, + "step": 17638 + }, + { + "epoch": 0.282224, + "grad_norm": 0.62109375, + "learning_rate": 7.235806451612903e-05, + "loss": 0.1643, + "step": 17639 + }, + { + "epoch": 0.28224, + "grad_norm": 0.625, + "learning_rate": 7.235645161290323e-05, + "loss": 0.1678, + "step": 17640 + }, + { + "epoch": 0.282256, + "grad_norm": 0.71875, + "learning_rate": 7.235483870967742e-05, + "loss": 0.1652, + "step": 17641 + }, + { + "epoch": 0.282272, + "grad_norm": 0.76171875, + "learning_rate": 7.235322580645162e-05, + "loss": 0.1698, + "step": 17642 + }, + { + "epoch": 0.282288, + "grad_norm": 0.59765625, + "learning_rate": 7.235161290322581e-05, + "loss": 0.1506, + "step": 17643 + }, + { + "epoch": 0.282304, + "grad_norm": 1.1328125, + "learning_rate": 7.235000000000001e-05, + "loss": 0.1607, + "step": 17644 + }, + { + "epoch": 0.28232, + "grad_norm": 0.64453125, + "learning_rate": 7.23483870967742e-05, + "loss": 0.1763, + "step": 17645 + }, + { + "epoch": 0.282336, + "grad_norm": 0.72265625, + "learning_rate": 7.23467741935484e-05, + "loss": 0.1392, + "step": 17646 + }, + { + "epoch": 0.282352, + "grad_norm": 0.8828125, + "learning_rate": 7.234516129032258e-05, + "loss": 0.1819, + "step": 17647 + }, + { + "epoch": 0.282368, + "grad_norm": 0.81640625, + "learning_rate": 7.234354838709678e-05, + "loss": 0.1926, + "step": 17648 + }, + { + "epoch": 0.282384, + "grad_norm": 1.0703125, + "learning_rate": 7.234193548387097e-05, + "loss": 0.167, + "step": 17649 + }, + { + "epoch": 0.2824, + "grad_norm": 0.83203125, + "learning_rate": 7.234032258064517e-05, + "loss": 0.2008, + "step": 17650 + }, + { + "epoch": 0.282416, + "grad_norm": 0.77734375, + "learning_rate": 7.233870967741935e-05, + "loss": 0.1775, + "step": 17651 + }, + { + "epoch": 0.282432, + "grad_norm": 0.7109375, + "learning_rate": 7.233709677419355e-05, + "loss": 0.1523, + "step": 17652 + }, + { + "epoch": 0.282448, + "grad_norm": 0.76171875, + "learning_rate": 7.233548387096775e-05, + "loss": 0.1872, + "step": 17653 + }, + { + "epoch": 0.282464, + "grad_norm": 0.859375, + "learning_rate": 7.233387096774194e-05, + "loss": 0.1459, + "step": 17654 + }, + { + "epoch": 0.28248, + "grad_norm": 0.75, + "learning_rate": 7.233225806451614e-05, + "loss": 0.1546, + "step": 17655 + }, + { + "epoch": 0.282496, + "grad_norm": 0.58203125, + "learning_rate": 7.233064516129032e-05, + "loss": 0.1607, + "step": 17656 + }, + { + "epoch": 0.282512, + "grad_norm": 0.77734375, + "learning_rate": 7.232903225806452e-05, + "loss": 0.1766, + "step": 17657 + }, + { + "epoch": 0.282528, + "grad_norm": 0.890625, + "learning_rate": 7.232741935483871e-05, + "loss": 0.1626, + "step": 17658 + }, + { + "epoch": 0.282544, + "grad_norm": 0.57421875, + "learning_rate": 7.232580645161291e-05, + "loss": 0.154, + "step": 17659 + }, + { + "epoch": 0.28256, + "grad_norm": 0.419921875, + "learning_rate": 7.23241935483871e-05, + "loss": 0.1452, + "step": 17660 + }, + { + "epoch": 0.282576, + "grad_norm": 1.1796875, + "learning_rate": 7.23225806451613e-05, + "loss": 0.1693, + "step": 17661 + }, + { + "epoch": 0.282592, + "grad_norm": 1.234375, + "learning_rate": 7.232096774193548e-05, + "loss": 0.1787, + "step": 17662 + }, + { + "epoch": 0.282608, + "grad_norm": 1.375, + "learning_rate": 7.231935483870968e-05, + "loss": 0.2303, + "step": 17663 + }, + { + "epoch": 0.282624, + "grad_norm": 0.703125, + "learning_rate": 7.231774193548387e-05, + "loss": 0.1489, + "step": 17664 + }, + { + "epoch": 0.28264, + "grad_norm": 0.6875, + "learning_rate": 7.231612903225807e-05, + "loss": 0.1536, + "step": 17665 + }, + { + "epoch": 0.282656, + "grad_norm": 0.8359375, + "learning_rate": 7.231451612903227e-05, + "loss": 0.1856, + "step": 17666 + }, + { + "epoch": 0.282672, + "grad_norm": 0.6171875, + "learning_rate": 7.231290322580645e-05, + "loss": 0.1512, + "step": 17667 + }, + { + "epoch": 0.282688, + "grad_norm": 0.94140625, + "learning_rate": 7.231129032258065e-05, + "loss": 0.1723, + "step": 17668 + }, + { + "epoch": 0.282704, + "grad_norm": 0.96484375, + "learning_rate": 7.230967741935484e-05, + "loss": 0.1849, + "step": 17669 + }, + { + "epoch": 0.28272, + "grad_norm": 0.76953125, + "learning_rate": 7.230806451612904e-05, + "loss": 0.1995, + "step": 17670 + }, + { + "epoch": 0.282736, + "grad_norm": 0.73046875, + "learning_rate": 7.230645161290322e-05, + "loss": 0.1639, + "step": 17671 + }, + { + "epoch": 0.282752, + "grad_norm": 1.125, + "learning_rate": 7.230483870967742e-05, + "loss": 0.1833, + "step": 17672 + }, + { + "epoch": 0.282768, + "grad_norm": 0.8515625, + "learning_rate": 7.230322580645161e-05, + "loss": 0.1854, + "step": 17673 + }, + { + "epoch": 0.282784, + "grad_norm": 0.89453125, + "learning_rate": 7.230161290322581e-05, + "loss": 0.2002, + "step": 17674 + }, + { + "epoch": 0.2828, + "grad_norm": 0.921875, + "learning_rate": 7.23e-05, + "loss": 0.1765, + "step": 17675 + }, + { + "epoch": 0.282816, + "grad_norm": 0.83984375, + "learning_rate": 7.22983870967742e-05, + "loss": 0.1843, + "step": 17676 + }, + { + "epoch": 0.282832, + "grad_norm": 0.859375, + "learning_rate": 7.22967741935484e-05, + "loss": 0.1797, + "step": 17677 + }, + { + "epoch": 0.282848, + "grad_norm": 0.84375, + "learning_rate": 7.22951612903226e-05, + "loss": 0.144, + "step": 17678 + }, + { + "epoch": 0.282864, + "grad_norm": 0.9765625, + "learning_rate": 7.229354838709678e-05, + "loss": 0.1504, + "step": 17679 + }, + { + "epoch": 0.28288, + "grad_norm": 0.6171875, + "learning_rate": 7.229193548387098e-05, + "loss": 0.1685, + "step": 17680 + }, + { + "epoch": 0.282896, + "grad_norm": 0.62890625, + "learning_rate": 7.229032258064517e-05, + "loss": 0.1695, + "step": 17681 + }, + { + "epoch": 0.282912, + "grad_norm": 1.7109375, + "learning_rate": 7.228870967741935e-05, + "loss": 0.2144, + "step": 17682 + }, + { + "epoch": 0.282928, + "grad_norm": 0.67578125, + "learning_rate": 7.228709677419355e-05, + "loss": 0.1793, + "step": 17683 + }, + { + "epoch": 0.282944, + "grad_norm": 0.890625, + "learning_rate": 7.228548387096774e-05, + "loss": 0.1875, + "step": 17684 + }, + { + "epoch": 0.28296, + "grad_norm": 0.77734375, + "learning_rate": 7.228387096774194e-05, + "loss": 0.1222, + "step": 17685 + }, + { + "epoch": 0.282976, + "grad_norm": 0.859375, + "learning_rate": 7.228225806451612e-05, + "loss": 0.1565, + "step": 17686 + }, + { + "epoch": 0.282992, + "grad_norm": 0.6796875, + "learning_rate": 7.228064516129032e-05, + "loss": 0.1723, + "step": 17687 + }, + { + "epoch": 0.283008, + "grad_norm": 0.74609375, + "learning_rate": 7.227903225806452e-05, + "loss": 0.1591, + "step": 17688 + }, + { + "epoch": 0.283024, + "grad_norm": 0.9296875, + "learning_rate": 7.227741935483871e-05, + "loss": 0.2051, + "step": 17689 + }, + { + "epoch": 0.28304, + "grad_norm": 0.625, + "learning_rate": 7.227580645161291e-05, + "loss": 0.1406, + "step": 17690 + }, + { + "epoch": 0.283056, + "grad_norm": 0.75390625, + "learning_rate": 7.227419354838711e-05, + "loss": 0.1869, + "step": 17691 + }, + { + "epoch": 0.283072, + "grad_norm": 1.1484375, + "learning_rate": 7.22725806451613e-05, + "loss": 0.1774, + "step": 17692 + }, + { + "epoch": 0.283088, + "grad_norm": 0.47265625, + "learning_rate": 7.22709677419355e-05, + "loss": 0.137, + "step": 17693 + }, + { + "epoch": 0.283104, + "grad_norm": 1.21875, + "learning_rate": 7.226935483870968e-05, + "loss": 0.1465, + "step": 17694 + }, + { + "epoch": 0.28312, + "grad_norm": 1.84375, + "learning_rate": 7.226774193548388e-05, + "loss": 0.1688, + "step": 17695 + }, + { + "epoch": 0.283136, + "grad_norm": 0.7421875, + "learning_rate": 7.226612903225807e-05, + "loss": 0.1504, + "step": 17696 + }, + { + "epoch": 0.283152, + "grad_norm": 1.2578125, + "learning_rate": 7.226451612903227e-05, + "loss": 0.1735, + "step": 17697 + }, + { + "epoch": 0.283168, + "grad_norm": 0.77734375, + "learning_rate": 7.226290322580645e-05, + "loss": 0.1271, + "step": 17698 + }, + { + "epoch": 0.283184, + "grad_norm": 0.97265625, + "learning_rate": 7.226129032258064e-05, + "loss": 0.1265, + "step": 17699 + }, + { + "epoch": 0.2832, + "grad_norm": 0.61328125, + "learning_rate": 7.225967741935484e-05, + "loss": 0.1403, + "step": 17700 + }, + { + "epoch": 0.283216, + "grad_norm": 0.80859375, + "learning_rate": 7.225806451612904e-05, + "loss": 0.1941, + "step": 17701 + }, + { + "epoch": 0.283232, + "grad_norm": 0.8203125, + "learning_rate": 7.225645161290324e-05, + "loss": 0.1942, + "step": 17702 + }, + { + "epoch": 0.283248, + "grad_norm": 0.921875, + "learning_rate": 7.225483870967742e-05, + "loss": 0.1799, + "step": 17703 + }, + { + "epoch": 0.283264, + "grad_norm": 0.77734375, + "learning_rate": 7.225322580645162e-05, + "loss": 0.1957, + "step": 17704 + }, + { + "epoch": 0.28328, + "grad_norm": 1.21875, + "learning_rate": 7.225161290322581e-05, + "loss": 0.2172, + "step": 17705 + }, + { + "epoch": 0.283296, + "grad_norm": 0.71875, + "learning_rate": 7.225000000000001e-05, + "loss": 0.1957, + "step": 17706 + }, + { + "epoch": 0.283312, + "grad_norm": 0.59375, + "learning_rate": 7.22483870967742e-05, + "loss": 0.1567, + "step": 17707 + }, + { + "epoch": 0.283328, + "grad_norm": 0.62109375, + "learning_rate": 7.22467741935484e-05, + "loss": 0.1382, + "step": 17708 + }, + { + "epoch": 0.283344, + "grad_norm": 0.84765625, + "learning_rate": 7.224516129032258e-05, + "loss": 0.1126, + "step": 17709 + }, + { + "epoch": 0.28336, + "grad_norm": 0.8359375, + "learning_rate": 7.224354838709678e-05, + "loss": 0.1609, + "step": 17710 + }, + { + "epoch": 0.283376, + "grad_norm": 0.57421875, + "learning_rate": 7.224193548387097e-05, + "loss": 0.1686, + "step": 17711 + }, + { + "epoch": 0.283392, + "grad_norm": 0.796875, + "learning_rate": 7.224032258064517e-05, + "loss": 0.1722, + "step": 17712 + }, + { + "epoch": 0.283408, + "grad_norm": 0.72265625, + "learning_rate": 7.223870967741936e-05, + "loss": 0.1874, + "step": 17713 + }, + { + "epoch": 0.283424, + "grad_norm": 1.6640625, + "learning_rate": 7.223709677419355e-05, + "loss": 0.1898, + "step": 17714 + }, + { + "epoch": 0.28344, + "grad_norm": 0.54296875, + "learning_rate": 7.223548387096775e-05, + "loss": 0.1181, + "step": 17715 + }, + { + "epoch": 0.283456, + "grad_norm": 0.640625, + "learning_rate": 7.223387096774194e-05, + "loss": 0.1499, + "step": 17716 + }, + { + "epoch": 0.283472, + "grad_norm": 1.015625, + "learning_rate": 7.223225806451614e-05, + "loss": 0.1888, + "step": 17717 + }, + { + "epoch": 0.283488, + "grad_norm": 1.265625, + "learning_rate": 7.223064516129032e-05, + "loss": 0.1877, + "step": 17718 + }, + { + "epoch": 0.283504, + "grad_norm": 0.546875, + "learning_rate": 7.222903225806452e-05, + "loss": 0.1545, + "step": 17719 + }, + { + "epoch": 0.28352, + "grad_norm": 1.015625, + "learning_rate": 7.222741935483871e-05, + "loss": 0.1852, + "step": 17720 + }, + { + "epoch": 0.283536, + "grad_norm": 1.4765625, + "learning_rate": 7.222580645161291e-05, + "loss": 0.1481, + "step": 17721 + }, + { + "epoch": 0.283552, + "grad_norm": 0.7734375, + "learning_rate": 7.22241935483871e-05, + "loss": 0.1526, + "step": 17722 + }, + { + "epoch": 0.283568, + "grad_norm": 0.8671875, + "learning_rate": 7.222258064516129e-05, + "loss": 0.1809, + "step": 17723 + }, + { + "epoch": 0.283584, + "grad_norm": 0.82421875, + "learning_rate": 7.222096774193548e-05, + "loss": 0.1436, + "step": 17724 + }, + { + "epoch": 0.2836, + "grad_norm": 0.9921875, + "learning_rate": 7.221935483870968e-05, + "loss": 0.2084, + "step": 17725 + }, + { + "epoch": 0.283616, + "grad_norm": 1.265625, + "learning_rate": 7.221774193548388e-05, + "loss": 0.1543, + "step": 17726 + }, + { + "epoch": 0.283632, + "grad_norm": 1.09375, + "learning_rate": 7.221612903225808e-05, + "loss": 0.1885, + "step": 17727 + }, + { + "epoch": 0.283648, + "grad_norm": 0.6015625, + "learning_rate": 7.221451612903226e-05, + "loss": 0.1525, + "step": 17728 + }, + { + "epoch": 0.283664, + "grad_norm": 0.6328125, + "learning_rate": 7.221290322580645e-05, + "loss": 0.1497, + "step": 17729 + }, + { + "epoch": 0.28368, + "grad_norm": 0.8359375, + "learning_rate": 7.221129032258065e-05, + "loss": 0.1835, + "step": 17730 + }, + { + "epoch": 0.283696, + "grad_norm": 0.8359375, + "learning_rate": 7.220967741935484e-05, + "loss": 0.1221, + "step": 17731 + }, + { + "epoch": 0.283712, + "grad_norm": 0.70703125, + "learning_rate": 7.220806451612904e-05, + "loss": 0.1742, + "step": 17732 + }, + { + "epoch": 0.283728, + "grad_norm": 0.8125, + "learning_rate": 7.220645161290322e-05, + "loss": 0.2072, + "step": 17733 + }, + { + "epoch": 0.283744, + "grad_norm": 0.68359375, + "learning_rate": 7.220483870967742e-05, + "loss": 0.1824, + "step": 17734 + }, + { + "epoch": 0.28376, + "grad_norm": 0.62890625, + "learning_rate": 7.220322580645161e-05, + "loss": 0.1804, + "step": 17735 + }, + { + "epoch": 0.283776, + "grad_norm": 0.94140625, + "learning_rate": 7.220161290322581e-05, + "loss": 0.2154, + "step": 17736 + }, + { + "epoch": 0.283792, + "grad_norm": 0.8359375, + "learning_rate": 7.22e-05, + "loss": 0.1835, + "step": 17737 + }, + { + "epoch": 0.283808, + "grad_norm": 0.88671875, + "learning_rate": 7.21983870967742e-05, + "loss": 0.1469, + "step": 17738 + }, + { + "epoch": 0.283824, + "grad_norm": 0.80859375, + "learning_rate": 7.219677419354839e-05, + "loss": 0.1736, + "step": 17739 + }, + { + "epoch": 0.28384, + "grad_norm": 0.7421875, + "learning_rate": 7.219516129032259e-05, + "loss": 0.1156, + "step": 17740 + }, + { + "epoch": 0.283856, + "grad_norm": 0.90234375, + "learning_rate": 7.219354838709678e-05, + "loss": 0.1891, + "step": 17741 + }, + { + "epoch": 0.283872, + "grad_norm": 0.50390625, + "learning_rate": 7.219193548387098e-05, + "loss": 0.165, + "step": 17742 + }, + { + "epoch": 0.283888, + "grad_norm": 0.79296875, + "learning_rate": 7.219032258064516e-05, + "loss": 0.1763, + "step": 17743 + }, + { + "epoch": 0.283904, + "grad_norm": 0.64453125, + "learning_rate": 7.218870967741935e-05, + "loss": 0.1439, + "step": 17744 + }, + { + "epoch": 0.28392, + "grad_norm": 1.34375, + "learning_rate": 7.218709677419355e-05, + "loss": 0.173, + "step": 17745 + }, + { + "epoch": 0.283936, + "grad_norm": 1.0078125, + "learning_rate": 7.218548387096774e-05, + "loss": 0.1814, + "step": 17746 + }, + { + "epoch": 0.283952, + "grad_norm": 0.78515625, + "learning_rate": 7.218387096774194e-05, + "loss": 0.1482, + "step": 17747 + }, + { + "epoch": 0.283968, + "grad_norm": 0.8671875, + "learning_rate": 7.218225806451613e-05, + "loss": 0.1666, + "step": 17748 + }, + { + "epoch": 0.283984, + "grad_norm": 0.9140625, + "learning_rate": 7.218064516129033e-05, + "loss": 0.2102, + "step": 17749 + }, + { + "epoch": 0.284, + "grad_norm": 0.80078125, + "learning_rate": 7.217903225806452e-05, + "loss": 0.1965, + "step": 17750 + }, + { + "epoch": 0.284016, + "grad_norm": 0.99609375, + "learning_rate": 7.217741935483872e-05, + "loss": 0.1439, + "step": 17751 + }, + { + "epoch": 0.284032, + "grad_norm": 0.81640625, + "learning_rate": 7.21758064516129e-05, + "loss": 0.2127, + "step": 17752 + }, + { + "epoch": 0.284048, + "grad_norm": 1.15625, + "learning_rate": 7.21741935483871e-05, + "loss": 0.1862, + "step": 17753 + }, + { + "epoch": 0.284064, + "grad_norm": 0.87890625, + "learning_rate": 7.217258064516129e-05, + "loss": 0.1638, + "step": 17754 + }, + { + "epoch": 0.28408, + "grad_norm": 0.55859375, + "learning_rate": 7.217096774193549e-05, + "loss": 0.1483, + "step": 17755 + }, + { + "epoch": 0.284096, + "grad_norm": 0.859375, + "learning_rate": 7.216935483870968e-05, + "loss": 0.1637, + "step": 17756 + }, + { + "epoch": 0.284112, + "grad_norm": 0.9296875, + "learning_rate": 7.216774193548388e-05, + "loss": 0.1564, + "step": 17757 + }, + { + "epoch": 0.284128, + "grad_norm": 0.62109375, + "learning_rate": 7.216612903225806e-05, + "loss": 0.138, + "step": 17758 + }, + { + "epoch": 0.284144, + "grad_norm": 0.71875, + "learning_rate": 7.216451612903226e-05, + "loss": 0.1514, + "step": 17759 + }, + { + "epoch": 0.28416, + "grad_norm": 0.87109375, + "learning_rate": 7.216290322580645e-05, + "loss": 0.1806, + "step": 17760 + }, + { + "epoch": 0.284176, + "grad_norm": 0.62890625, + "learning_rate": 7.216129032258065e-05, + "loss": 0.1689, + "step": 17761 + }, + { + "epoch": 0.284192, + "grad_norm": 0.55859375, + "learning_rate": 7.215967741935485e-05, + "loss": 0.1833, + "step": 17762 + }, + { + "epoch": 0.284208, + "grad_norm": 0.8359375, + "learning_rate": 7.215806451612903e-05, + "loss": 0.1562, + "step": 17763 + }, + { + "epoch": 0.284224, + "grad_norm": 1.0390625, + "learning_rate": 7.215645161290323e-05, + "loss": 0.1724, + "step": 17764 + }, + { + "epoch": 0.28424, + "grad_norm": 1.34375, + "learning_rate": 7.215483870967742e-05, + "loss": 0.1827, + "step": 17765 + }, + { + "epoch": 0.284256, + "grad_norm": 0.72265625, + "learning_rate": 7.215322580645162e-05, + "loss": 0.1552, + "step": 17766 + }, + { + "epoch": 0.284272, + "grad_norm": 1.375, + "learning_rate": 7.21516129032258e-05, + "loss": 0.153, + "step": 17767 + }, + { + "epoch": 0.284288, + "grad_norm": 0.8046875, + "learning_rate": 7.215e-05, + "loss": 0.1944, + "step": 17768 + }, + { + "epoch": 0.284304, + "grad_norm": 0.96875, + "learning_rate": 7.214838709677419e-05, + "loss": 0.1994, + "step": 17769 + }, + { + "epoch": 0.28432, + "grad_norm": 0.734375, + "learning_rate": 7.214677419354839e-05, + "loss": 0.1969, + "step": 17770 + }, + { + "epoch": 0.284336, + "grad_norm": 0.9453125, + "learning_rate": 7.214516129032258e-05, + "loss": 0.2373, + "step": 17771 + }, + { + "epoch": 0.284352, + "grad_norm": 0.87890625, + "learning_rate": 7.214354838709678e-05, + "loss": 0.1652, + "step": 17772 + }, + { + "epoch": 0.284368, + "grad_norm": 0.62109375, + "learning_rate": 7.214193548387098e-05, + "loss": 0.1626, + "step": 17773 + }, + { + "epoch": 0.284384, + "grad_norm": 1.0625, + "learning_rate": 7.214032258064518e-05, + "loss": 0.1795, + "step": 17774 + }, + { + "epoch": 0.2844, + "grad_norm": 0.83984375, + "learning_rate": 7.213870967741936e-05, + "loss": 0.1625, + "step": 17775 + }, + { + "epoch": 0.284416, + "grad_norm": 0.76953125, + "learning_rate": 7.213709677419355e-05, + "loss": 0.1715, + "step": 17776 + }, + { + "epoch": 0.284432, + "grad_norm": 0.7578125, + "learning_rate": 7.213548387096775e-05, + "loss": 0.1522, + "step": 17777 + }, + { + "epoch": 0.284448, + "grad_norm": 0.64453125, + "learning_rate": 7.213387096774193e-05, + "loss": 0.174, + "step": 17778 + }, + { + "epoch": 0.284464, + "grad_norm": 1.1484375, + "learning_rate": 7.213225806451613e-05, + "loss": 0.178, + "step": 17779 + }, + { + "epoch": 0.28448, + "grad_norm": 0.921875, + "learning_rate": 7.213064516129032e-05, + "loss": 0.1857, + "step": 17780 + }, + { + "epoch": 0.284496, + "grad_norm": 0.78515625, + "learning_rate": 7.212903225806452e-05, + "loss": 0.1804, + "step": 17781 + }, + { + "epoch": 0.284512, + "grad_norm": 1.140625, + "learning_rate": 7.21274193548387e-05, + "loss": 0.1944, + "step": 17782 + }, + { + "epoch": 0.284528, + "grad_norm": 0.67578125, + "learning_rate": 7.21258064516129e-05, + "loss": 0.1724, + "step": 17783 + }, + { + "epoch": 0.284544, + "grad_norm": 1.0, + "learning_rate": 7.212419354838709e-05, + "loss": 0.2168, + "step": 17784 + }, + { + "epoch": 0.28456, + "grad_norm": 0.75, + "learning_rate": 7.212258064516129e-05, + "loss": 0.2059, + "step": 17785 + }, + { + "epoch": 0.284576, + "grad_norm": 0.76953125, + "learning_rate": 7.212096774193549e-05, + "loss": 0.1572, + "step": 17786 + }, + { + "epoch": 0.284592, + "grad_norm": 1.2109375, + "learning_rate": 7.211935483870969e-05, + "loss": 0.1547, + "step": 17787 + }, + { + "epoch": 0.284608, + "grad_norm": 0.875, + "learning_rate": 7.211774193548388e-05, + "loss": 0.1626, + "step": 17788 + }, + { + "epoch": 0.284624, + "grad_norm": 0.97265625, + "learning_rate": 7.211612903225808e-05, + "loss": 0.1741, + "step": 17789 + }, + { + "epoch": 0.28464, + "grad_norm": 0.5234375, + "learning_rate": 7.211451612903226e-05, + "loss": 0.1471, + "step": 17790 + }, + { + "epoch": 0.284656, + "grad_norm": 0.734375, + "learning_rate": 7.211290322580645e-05, + "loss": 0.1675, + "step": 17791 + }, + { + "epoch": 0.284672, + "grad_norm": 0.59375, + "learning_rate": 7.211129032258065e-05, + "loss": 0.1355, + "step": 17792 + }, + { + "epoch": 0.284688, + "grad_norm": 0.8125, + "learning_rate": 7.210967741935483e-05, + "loss": 0.172, + "step": 17793 + }, + { + "epoch": 0.284704, + "grad_norm": 1.1875, + "learning_rate": 7.210806451612903e-05, + "loss": 0.1667, + "step": 17794 + }, + { + "epoch": 0.28472, + "grad_norm": 0.5390625, + "learning_rate": 7.210645161290322e-05, + "loss": 0.1294, + "step": 17795 + }, + { + "epoch": 0.284736, + "grad_norm": 0.82421875, + "learning_rate": 7.210483870967742e-05, + "loss": 0.1767, + "step": 17796 + }, + { + "epoch": 0.284752, + "grad_norm": 0.7578125, + "learning_rate": 7.210322580645162e-05, + "loss": 0.166, + "step": 17797 + }, + { + "epoch": 0.284768, + "grad_norm": 0.5234375, + "learning_rate": 7.210161290322582e-05, + "loss": 0.1725, + "step": 17798 + }, + { + "epoch": 0.284784, + "grad_norm": 1.1015625, + "learning_rate": 7.21e-05, + "loss": 0.1704, + "step": 17799 + }, + { + "epoch": 0.2848, + "grad_norm": 1.0234375, + "learning_rate": 7.20983870967742e-05, + "loss": 0.1157, + "step": 17800 + }, + { + "epoch": 0.284816, + "grad_norm": 0.8359375, + "learning_rate": 7.209677419354839e-05, + "loss": 0.18, + "step": 17801 + }, + { + "epoch": 0.284832, + "grad_norm": 0.5, + "learning_rate": 7.209516129032259e-05, + "loss": 0.1689, + "step": 17802 + }, + { + "epoch": 0.284848, + "grad_norm": 0.45703125, + "learning_rate": 7.209354838709678e-05, + "loss": 0.1498, + "step": 17803 + }, + { + "epoch": 0.284864, + "grad_norm": 0.89453125, + "learning_rate": 7.209193548387098e-05, + "loss": 0.1938, + "step": 17804 + }, + { + "epoch": 0.28488, + "grad_norm": 0.83984375, + "learning_rate": 7.209032258064516e-05, + "loss": 0.1858, + "step": 17805 + }, + { + "epoch": 0.284896, + "grad_norm": 0.94921875, + "learning_rate": 7.208870967741936e-05, + "loss": 0.1509, + "step": 17806 + }, + { + "epoch": 0.284912, + "grad_norm": 0.486328125, + "learning_rate": 7.208709677419355e-05, + "loss": 0.1531, + "step": 17807 + }, + { + "epoch": 0.284928, + "grad_norm": 0.56640625, + "learning_rate": 7.208548387096775e-05, + "loss": 0.1582, + "step": 17808 + }, + { + "epoch": 0.284944, + "grad_norm": 0.64453125, + "learning_rate": 7.208387096774195e-05, + "loss": 0.1434, + "step": 17809 + }, + { + "epoch": 0.28496, + "grad_norm": 0.60546875, + "learning_rate": 7.208225806451613e-05, + "loss": 0.1699, + "step": 17810 + }, + { + "epoch": 0.284976, + "grad_norm": 0.7109375, + "learning_rate": 7.208064516129033e-05, + "loss": 0.1486, + "step": 17811 + }, + { + "epoch": 0.284992, + "grad_norm": 0.8984375, + "learning_rate": 7.207903225806452e-05, + "loss": 0.1759, + "step": 17812 + }, + { + "epoch": 0.285008, + "grad_norm": 0.71875, + "learning_rate": 7.207741935483872e-05, + "loss": 0.1725, + "step": 17813 + }, + { + "epoch": 0.285024, + "grad_norm": 0.62109375, + "learning_rate": 7.20758064516129e-05, + "loss": 0.1669, + "step": 17814 + }, + { + "epoch": 0.28504, + "grad_norm": 0.7890625, + "learning_rate": 7.20741935483871e-05, + "loss": 0.199, + "step": 17815 + }, + { + "epoch": 0.285056, + "grad_norm": 1.0078125, + "learning_rate": 7.207258064516129e-05, + "loss": 0.1446, + "step": 17816 + }, + { + "epoch": 0.285072, + "grad_norm": 0.85546875, + "learning_rate": 7.207096774193549e-05, + "loss": 0.1519, + "step": 17817 + }, + { + "epoch": 0.285088, + "grad_norm": 0.65625, + "learning_rate": 7.206935483870968e-05, + "loss": 0.1661, + "step": 17818 + }, + { + "epoch": 0.285104, + "grad_norm": 1.3984375, + "learning_rate": 7.206774193548387e-05, + "loss": 0.165, + "step": 17819 + }, + { + "epoch": 0.28512, + "grad_norm": 0.58984375, + "learning_rate": 7.206612903225806e-05, + "loss": 0.1565, + "step": 17820 + }, + { + "epoch": 0.285136, + "grad_norm": 0.80078125, + "learning_rate": 7.206451612903226e-05, + "loss": 0.1393, + "step": 17821 + }, + { + "epoch": 0.285152, + "grad_norm": 0.5859375, + "learning_rate": 7.206290322580646e-05, + "loss": 0.1572, + "step": 17822 + }, + { + "epoch": 0.285168, + "grad_norm": 1.328125, + "learning_rate": 7.206129032258065e-05, + "loss": 0.1355, + "step": 17823 + }, + { + "epoch": 0.285184, + "grad_norm": 0.88671875, + "learning_rate": 7.205967741935485e-05, + "loss": 0.1675, + "step": 17824 + }, + { + "epoch": 0.2852, + "grad_norm": 0.875, + "learning_rate": 7.205806451612903e-05, + "loss": 0.1383, + "step": 17825 + }, + { + "epoch": 0.285216, + "grad_norm": 0.76171875, + "learning_rate": 7.205645161290323e-05, + "loss": 0.1686, + "step": 17826 + }, + { + "epoch": 0.285232, + "grad_norm": 1.25, + "learning_rate": 7.205483870967742e-05, + "loss": 0.1708, + "step": 17827 + }, + { + "epoch": 0.285248, + "grad_norm": 0.71484375, + "learning_rate": 7.205322580645162e-05, + "loss": 0.1901, + "step": 17828 + }, + { + "epoch": 0.285264, + "grad_norm": 0.71484375, + "learning_rate": 7.20516129032258e-05, + "loss": 0.1761, + "step": 17829 + }, + { + "epoch": 0.28528, + "grad_norm": 1.140625, + "learning_rate": 7.205e-05, + "loss": 0.1711, + "step": 17830 + }, + { + "epoch": 0.285296, + "grad_norm": 0.6875, + "learning_rate": 7.204838709677419e-05, + "loss": 0.1572, + "step": 17831 + }, + { + "epoch": 0.285312, + "grad_norm": 0.87890625, + "learning_rate": 7.204677419354839e-05, + "loss": 0.2092, + "step": 17832 + }, + { + "epoch": 0.285328, + "grad_norm": 0.6796875, + "learning_rate": 7.204516129032259e-05, + "loss": 0.1682, + "step": 17833 + }, + { + "epoch": 0.285344, + "grad_norm": 0.62109375, + "learning_rate": 7.204354838709679e-05, + "loss": 0.136, + "step": 17834 + }, + { + "epoch": 0.28536, + "grad_norm": 1.046875, + "learning_rate": 7.204193548387097e-05, + "loss": 0.1544, + "step": 17835 + }, + { + "epoch": 0.285376, + "grad_norm": 0.98828125, + "learning_rate": 7.204032258064517e-05, + "loss": 0.17, + "step": 17836 + }, + { + "epoch": 0.285392, + "grad_norm": 1.6875, + "learning_rate": 7.203870967741936e-05, + "loss": 0.1549, + "step": 17837 + }, + { + "epoch": 0.285408, + "grad_norm": 0.6796875, + "learning_rate": 7.203709677419355e-05, + "loss": 0.1369, + "step": 17838 + }, + { + "epoch": 0.285424, + "grad_norm": 0.98828125, + "learning_rate": 7.203548387096775e-05, + "loss": 0.1552, + "step": 17839 + }, + { + "epoch": 0.28544, + "grad_norm": 0.859375, + "learning_rate": 7.203387096774193e-05, + "loss": 0.1674, + "step": 17840 + }, + { + "epoch": 0.285456, + "grad_norm": 0.67578125, + "learning_rate": 7.203225806451613e-05, + "loss": 0.1735, + "step": 17841 + }, + { + "epoch": 0.285472, + "grad_norm": 0.94921875, + "learning_rate": 7.203064516129032e-05, + "loss": 0.2385, + "step": 17842 + }, + { + "epoch": 0.285488, + "grad_norm": 1.4140625, + "learning_rate": 7.202903225806452e-05, + "loss": 0.1895, + "step": 17843 + }, + { + "epoch": 0.285504, + "grad_norm": 0.55859375, + "learning_rate": 7.202741935483872e-05, + "loss": 0.1604, + "step": 17844 + }, + { + "epoch": 0.28552, + "grad_norm": 0.55859375, + "learning_rate": 7.202580645161292e-05, + "loss": 0.1357, + "step": 17845 + }, + { + "epoch": 0.285536, + "grad_norm": 1.0859375, + "learning_rate": 7.20241935483871e-05, + "loss": 0.1491, + "step": 17846 + }, + { + "epoch": 0.285552, + "grad_norm": 0.71875, + "learning_rate": 7.20225806451613e-05, + "loss": 0.1753, + "step": 17847 + }, + { + "epoch": 0.285568, + "grad_norm": 0.56640625, + "learning_rate": 7.202096774193549e-05, + "loss": 0.1651, + "step": 17848 + }, + { + "epoch": 0.285584, + "grad_norm": 0.81640625, + "learning_rate": 7.201935483870969e-05, + "loss": 0.157, + "step": 17849 + }, + { + "epoch": 0.2856, + "grad_norm": 0.6484375, + "learning_rate": 7.201774193548387e-05, + "loss": 0.1657, + "step": 17850 + }, + { + "epoch": 0.285616, + "grad_norm": 0.73828125, + "learning_rate": 7.201612903225807e-05, + "loss": 0.2223, + "step": 17851 + }, + { + "epoch": 0.285632, + "grad_norm": 0.703125, + "learning_rate": 7.201451612903226e-05, + "loss": 0.1681, + "step": 17852 + }, + { + "epoch": 0.285648, + "grad_norm": 0.5625, + "learning_rate": 7.201290322580645e-05, + "loss": 0.1401, + "step": 17853 + }, + { + "epoch": 0.285664, + "grad_norm": 1.375, + "learning_rate": 7.201129032258065e-05, + "loss": 0.1708, + "step": 17854 + }, + { + "epoch": 0.28568, + "grad_norm": 0.65234375, + "learning_rate": 7.200967741935483e-05, + "loss": 0.1663, + "step": 17855 + }, + { + "epoch": 0.285696, + "grad_norm": 0.6796875, + "learning_rate": 7.200806451612903e-05, + "loss": 0.1723, + "step": 17856 + }, + { + "epoch": 0.285712, + "grad_norm": 0.92578125, + "learning_rate": 7.200645161290323e-05, + "loss": 0.1946, + "step": 17857 + }, + { + "epoch": 0.285728, + "grad_norm": 0.73046875, + "learning_rate": 7.200483870967743e-05, + "loss": 0.2052, + "step": 17858 + }, + { + "epoch": 0.285744, + "grad_norm": 0.498046875, + "learning_rate": 7.200322580645162e-05, + "loss": 0.1142, + "step": 17859 + }, + { + "epoch": 0.28576, + "grad_norm": 0.8671875, + "learning_rate": 7.200161290322582e-05, + "loss": 0.2164, + "step": 17860 + }, + { + "epoch": 0.285776, + "grad_norm": 0.78125, + "learning_rate": 7.2e-05, + "loss": 0.1804, + "step": 17861 + }, + { + "epoch": 0.285792, + "grad_norm": 1.078125, + "learning_rate": 7.19983870967742e-05, + "loss": 0.1858, + "step": 17862 + }, + { + "epoch": 0.285808, + "grad_norm": 0.62890625, + "learning_rate": 7.199677419354839e-05, + "loss": 0.1347, + "step": 17863 + }, + { + "epoch": 0.285824, + "grad_norm": 0.92578125, + "learning_rate": 7.199516129032259e-05, + "loss": 0.1746, + "step": 17864 + }, + { + "epoch": 0.28584, + "grad_norm": 0.9375, + "learning_rate": 7.199354838709677e-05, + "loss": 0.2206, + "step": 17865 + }, + { + "epoch": 0.285856, + "grad_norm": 0.7109375, + "learning_rate": 7.199193548387097e-05, + "loss": 0.1448, + "step": 17866 + }, + { + "epoch": 0.285872, + "grad_norm": 0.6484375, + "learning_rate": 7.199032258064516e-05, + "loss": 0.1721, + "step": 17867 + }, + { + "epoch": 0.285888, + "grad_norm": 0.83203125, + "learning_rate": 7.198870967741936e-05, + "loss": 0.1754, + "step": 17868 + }, + { + "epoch": 0.285904, + "grad_norm": 0.86328125, + "learning_rate": 7.198709677419356e-05, + "loss": 0.1988, + "step": 17869 + }, + { + "epoch": 0.28592, + "grad_norm": 0.9296875, + "learning_rate": 7.198548387096774e-05, + "loss": 0.1898, + "step": 17870 + }, + { + "epoch": 0.285936, + "grad_norm": 1.1015625, + "learning_rate": 7.198387096774194e-05, + "loss": 0.1946, + "step": 17871 + }, + { + "epoch": 0.285952, + "grad_norm": 0.81640625, + "learning_rate": 7.198225806451613e-05, + "loss": 0.1667, + "step": 17872 + }, + { + "epoch": 0.285968, + "grad_norm": 0.828125, + "learning_rate": 7.198064516129033e-05, + "loss": 0.173, + "step": 17873 + }, + { + "epoch": 0.285984, + "grad_norm": 1.46875, + "learning_rate": 7.197903225806452e-05, + "loss": 0.2145, + "step": 17874 + }, + { + "epoch": 0.286, + "grad_norm": 1.2578125, + "learning_rate": 7.197741935483872e-05, + "loss": 0.1684, + "step": 17875 + }, + { + "epoch": 0.286016, + "grad_norm": 0.81640625, + "learning_rate": 7.19758064516129e-05, + "loss": 0.1652, + "step": 17876 + }, + { + "epoch": 0.286032, + "grad_norm": 1.0390625, + "learning_rate": 7.19741935483871e-05, + "loss": 0.1478, + "step": 17877 + }, + { + "epoch": 0.286048, + "grad_norm": 0.92578125, + "learning_rate": 7.197258064516129e-05, + "loss": 0.2307, + "step": 17878 + }, + { + "epoch": 0.286064, + "grad_norm": 0.6015625, + "learning_rate": 7.197096774193549e-05, + "loss": 0.1611, + "step": 17879 + }, + { + "epoch": 0.28608, + "grad_norm": 0.8828125, + "learning_rate": 7.196935483870967e-05, + "loss": 0.1758, + "step": 17880 + }, + { + "epoch": 0.286096, + "grad_norm": 1.1796875, + "learning_rate": 7.196774193548387e-05, + "loss": 0.1185, + "step": 17881 + }, + { + "epoch": 0.286112, + "grad_norm": 0.59765625, + "learning_rate": 7.196612903225807e-05, + "loss": 0.1447, + "step": 17882 + }, + { + "epoch": 0.286128, + "grad_norm": 0.5703125, + "learning_rate": 7.196451612903227e-05, + "loss": 0.1703, + "step": 17883 + }, + { + "epoch": 0.286144, + "grad_norm": 0.63671875, + "learning_rate": 7.196290322580646e-05, + "loss": 0.1406, + "step": 17884 + }, + { + "epoch": 0.28616, + "grad_norm": 0.546875, + "learning_rate": 7.196129032258064e-05, + "loss": 0.1206, + "step": 17885 + }, + { + "epoch": 0.286176, + "grad_norm": 0.890625, + "learning_rate": 7.195967741935484e-05, + "loss": 0.2129, + "step": 17886 + }, + { + "epoch": 0.286192, + "grad_norm": 1.484375, + "learning_rate": 7.195806451612903e-05, + "loss": 0.1868, + "step": 17887 + }, + { + "epoch": 0.286208, + "grad_norm": 0.65625, + "learning_rate": 7.195645161290323e-05, + "loss": 0.1317, + "step": 17888 + }, + { + "epoch": 0.286224, + "grad_norm": 0.68359375, + "learning_rate": 7.195483870967742e-05, + "loss": 0.1671, + "step": 17889 + }, + { + "epoch": 0.28624, + "grad_norm": 0.6171875, + "learning_rate": 7.195322580645161e-05, + "loss": 0.1303, + "step": 17890 + }, + { + "epoch": 0.286256, + "grad_norm": 0.984375, + "learning_rate": 7.19516129032258e-05, + "loss": 0.1712, + "step": 17891 + }, + { + "epoch": 0.286272, + "grad_norm": 0.93359375, + "learning_rate": 7.195e-05, + "loss": 0.2074, + "step": 17892 + }, + { + "epoch": 0.286288, + "grad_norm": 0.80078125, + "learning_rate": 7.19483870967742e-05, + "loss": 0.1865, + "step": 17893 + }, + { + "epoch": 0.286304, + "grad_norm": 0.6484375, + "learning_rate": 7.19467741935484e-05, + "loss": 0.1636, + "step": 17894 + }, + { + "epoch": 0.28632, + "grad_norm": 1.1328125, + "learning_rate": 7.194516129032259e-05, + "loss": 0.2233, + "step": 17895 + }, + { + "epoch": 0.286336, + "grad_norm": 0.6796875, + "learning_rate": 7.194354838709679e-05, + "loss": 0.1453, + "step": 17896 + }, + { + "epoch": 0.286352, + "grad_norm": 0.75, + "learning_rate": 7.194193548387097e-05, + "loss": 0.1741, + "step": 17897 + }, + { + "epoch": 0.286368, + "grad_norm": 0.5859375, + "learning_rate": 7.194032258064517e-05, + "loss": 0.1743, + "step": 17898 + }, + { + "epoch": 0.286384, + "grad_norm": 0.7578125, + "learning_rate": 7.193870967741936e-05, + "loss": 0.1869, + "step": 17899 + }, + { + "epoch": 0.2864, + "grad_norm": 1.0859375, + "learning_rate": 7.193709677419354e-05, + "loss": 0.1735, + "step": 17900 + }, + { + "epoch": 0.286416, + "grad_norm": 1.3984375, + "learning_rate": 7.193548387096774e-05, + "loss": 0.208, + "step": 17901 + }, + { + "epoch": 0.286432, + "grad_norm": 0.73046875, + "learning_rate": 7.193387096774193e-05, + "loss": 0.1655, + "step": 17902 + }, + { + "epoch": 0.286448, + "grad_norm": 0.625, + "learning_rate": 7.193225806451613e-05, + "loss": 0.15, + "step": 17903 + }, + { + "epoch": 0.286464, + "grad_norm": 0.80859375, + "learning_rate": 7.193064516129033e-05, + "loss": 0.1758, + "step": 17904 + }, + { + "epoch": 0.28648, + "grad_norm": 0.94921875, + "learning_rate": 7.192903225806453e-05, + "loss": 0.1447, + "step": 17905 + }, + { + "epoch": 0.286496, + "grad_norm": 1.671875, + "learning_rate": 7.192741935483871e-05, + "loss": 0.2025, + "step": 17906 + }, + { + "epoch": 0.286512, + "grad_norm": 0.85546875, + "learning_rate": 7.192580645161291e-05, + "loss": 0.168, + "step": 17907 + }, + { + "epoch": 0.286528, + "grad_norm": 0.6640625, + "learning_rate": 7.19241935483871e-05, + "loss": 0.1327, + "step": 17908 + }, + { + "epoch": 0.286544, + "grad_norm": 0.96484375, + "learning_rate": 7.19225806451613e-05, + "loss": 0.1955, + "step": 17909 + }, + { + "epoch": 0.28656, + "grad_norm": 1.328125, + "learning_rate": 7.192096774193549e-05, + "loss": 0.202, + "step": 17910 + }, + { + "epoch": 0.286576, + "grad_norm": 0.71484375, + "learning_rate": 7.191935483870969e-05, + "loss": 0.1694, + "step": 17911 + }, + { + "epoch": 0.286592, + "grad_norm": 0.7578125, + "learning_rate": 7.191774193548387e-05, + "loss": 0.215, + "step": 17912 + }, + { + "epoch": 0.286608, + "grad_norm": 0.8515625, + "learning_rate": 7.191612903225807e-05, + "loss": 0.1591, + "step": 17913 + }, + { + "epoch": 0.286624, + "grad_norm": 0.74609375, + "learning_rate": 7.191451612903226e-05, + "loss": 0.1406, + "step": 17914 + }, + { + "epoch": 0.28664, + "grad_norm": 0.75, + "learning_rate": 7.191290322580646e-05, + "loss": 0.1594, + "step": 17915 + }, + { + "epoch": 0.286656, + "grad_norm": 1.15625, + "learning_rate": 7.191129032258064e-05, + "loss": 0.1621, + "step": 17916 + }, + { + "epoch": 0.286672, + "grad_norm": 1.5234375, + "learning_rate": 7.190967741935484e-05, + "loss": 0.1783, + "step": 17917 + }, + { + "epoch": 0.286688, + "grad_norm": 1.0390625, + "learning_rate": 7.190806451612904e-05, + "loss": 0.1524, + "step": 17918 + }, + { + "epoch": 0.286704, + "grad_norm": 0.7265625, + "learning_rate": 7.190645161290323e-05, + "loss": 0.1931, + "step": 17919 + }, + { + "epoch": 0.28672, + "grad_norm": 0.88671875, + "learning_rate": 7.190483870967743e-05, + "loss": 0.1554, + "step": 17920 + }, + { + "epoch": 0.286736, + "grad_norm": 0.71875, + "learning_rate": 7.190322580645161e-05, + "loss": 0.1566, + "step": 17921 + }, + { + "epoch": 0.286752, + "grad_norm": 0.859375, + "learning_rate": 7.190161290322581e-05, + "loss": 0.137, + "step": 17922 + }, + { + "epoch": 0.286768, + "grad_norm": 0.84375, + "learning_rate": 7.19e-05, + "loss": 0.1647, + "step": 17923 + }, + { + "epoch": 0.286784, + "grad_norm": 1.03125, + "learning_rate": 7.18983870967742e-05, + "loss": 0.1775, + "step": 17924 + }, + { + "epoch": 0.2868, + "grad_norm": 0.875, + "learning_rate": 7.189677419354839e-05, + "loss": 0.1967, + "step": 17925 + }, + { + "epoch": 0.286816, + "grad_norm": 0.9140625, + "learning_rate": 7.189516129032258e-05, + "loss": 0.1554, + "step": 17926 + }, + { + "epoch": 0.286832, + "grad_norm": 0.8125, + "learning_rate": 7.189354838709677e-05, + "loss": 0.1363, + "step": 17927 + }, + { + "epoch": 0.286848, + "grad_norm": 0.6484375, + "learning_rate": 7.189193548387097e-05, + "loss": 0.1579, + "step": 17928 + }, + { + "epoch": 0.286864, + "grad_norm": 1.0625, + "learning_rate": 7.189032258064517e-05, + "loss": 0.1865, + "step": 17929 + }, + { + "epoch": 0.28688, + "grad_norm": 0.80859375, + "learning_rate": 7.188870967741937e-05, + "loss": 0.1624, + "step": 17930 + }, + { + "epoch": 0.286896, + "grad_norm": 0.5, + "learning_rate": 7.188709677419356e-05, + "loss": 0.1365, + "step": 17931 + }, + { + "epoch": 0.286912, + "grad_norm": 0.7109375, + "learning_rate": 7.188548387096774e-05, + "loss": 0.1584, + "step": 17932 + }, + { + "epoch": 0.286928, + "grad_norm": 1.046875, + "learning_rate": 7.188387096774194e-05, + "loss": 0.1372, + "step": 17933 + }, + { + "epoch": 0.286944, + "grad_norm": 1.1328125, + "learning_rate": 7.188225806451613e-05, + "loss": 0.2213, + "step": 17934 + }, + { + "epoch": 0.28696, + "grad_norm": 0.6640625, + "learning_rate": 7.188064516129033e-05, + "loss": 0.1612, + "step": 17935 + }, + { + "epoch": 0.286976, + "grad_norm": 0.99609375, + "learning_rate": 7.187903225806451e-05, + "loss": 0.2026, + "step": 17936 + }, + { + "epoch": 0.286992, + "grad_norm": 0.9453125, + "learning_rate": 7.187741935483871e-05, + "loss": 0.1844, + "step": 17937 + }, + { + "epoch": 0.287008, + "grad_norm": 0.56640625, + "learning_rate": 7.18758064516129e-05, + "loss": 0.1738, + "step": 17938 + }, + { + "epoch": 0.287024, + "grad_norm": 1.140625, + "learning_rate": 7.18741935483871e-05, + "loss": 0.1637, + "step": 17939 + }, + { + "epoch": 0.28704, + "grad_norm": 1.0234375, + "learning_rate": 7.18725806451613e-05, + "loss": 0.1767, + "step": 17940 + }, + { + "epoch": 0.287056, + "grad_norm": 0.68359375, + "learning_rate": 7.187096774193548e-05, + "loss": 0.1823, + "step": 17941 + }, + { + "epoch": 0.287072, + "grad_norm": 0.9140625, + "learning_rate": 7.186935483870968e-05, + "loss": 0.1656, + "step": 17942 + }, + { + "epoch": 0.287088, + "grad_norm": 0.890625, + "learning_rate": 7.186774193548388e-05, + "loss": 0.1966, + "step": 17943 + }, + { + "epoch": 0.287104, + "grad_norm": 0.921875, + "learning_rate": 7.186612903225807e-05, + "loss": 0.1797, + "step": 17944 + }, + { + "epoch": 0.28712, + "grad_norm": 0.8125, + "learning_rate": 7.186451612903227e-05, + "loss": 0.172, + "step": 17945 + }, + { + "epoch": 0.287136, + "grad_norm": 0.89453125, + "learning_rate": 7.186290322580646e-05, + "loss": 0.1779, + "step": 17946 + }, + { + "epoch": 0.287152, + "grad_norm": 1.109375, + "learning_rate": 7.186129032258064e-05, + "loss": 0.2105, + "step": 17947 + }, + { + "epoch": 0.287168, + "grad_norm": 0.84765625, + "learning_rate": 7.185967741935484e-05, + "loss": 0.1836, + "step": 17948 + }, + { + "epoch": 0.287184, + "grad_norm": 1.1015625, + "learning_rate": 7.185806451612903e-05, + "loss": 0.1505, + "step": 17949 + }, + { + "epoch": 0.2872, + "grad_norm": 0.546875, + "learning_rate": 7.185645161290323e-05, + "loss": 0.153, + "step": 17950 + }, + { + "epoch": 0.287216, + "grad_norm": 1.0625, + "learning_rate": 7.185483870967741e-05, + "loss": 0.2282, + "step": 17951 + }, + { + "epoch": 0.287232, + "grad_norm": 0.77734375, + "learning_rate": 7.185322580645161e-05, + "loss": 0.1224, + "step": 17952 + }, + { + "epoch": 0.287248, + "grad_norm": 0.8671875, + "learning_rate": 7.185161290322581e-05, + "loss": 0.1529, + "step": 17953 + }, + { + "epoch": 0.287264, + "grad_norm": 0.57421875, + "learning_rate": 7.185000000000001e-05, + "loss": 0.1338, + "step": 17954 + }, + { + "epoch": 0.28728, + "grad_norm": 1.390625, + "learning_rate": 7.18483870967742e-05, + "loss": 0.1934, + "step": 17955 + }, + { + "epoch": 0.287296, + "grad_norm": 1.4765625, + "learning_rate": 7.18467741935484e-05, + "loss": 0.2188, + "step": 17956 + }, + { + "epoch": 0.287312, + "grad_norm": 1.25, + "learning_rate": 7.184516129032258e-05, + "loss": 0.1631, + "step": 17957 + }, + { + "epoch": 0.287328, + "grad_norm": 0.80078125, + "learning_rate": 7.184354838709678e-05, + "loss": 0.1792, + "step": 17958 + }, + { + "epoch": 0.287344, + "grad_norm": 0.7890625, + "learning_rate": 7.184193548387097e-05, + "loss": 0.1741, + "step": 17959 + }, + { + "epoch": 0.28736, + "grad_norm": 0.71875, + "learning_rate": 7.184032258064517e-05, + "loss": 0.1545, + "step": 17960 + }, + { + "epoch": 0.287376, + "grad_norm": 0.6015625, + "learning_rate": 7.183870967741935e-05, + "loss": 0.189, + "step": 17961 + }, + { + "epoch": 0.287392, + "grad_norm": 0.69921875, + "learning_rate": 7.183709677419354e-05, + "loss": 0.1843, + "step": 17962 + }, + { + "epoch": 0.287408, + "grad_norm": 0.84375, + "learning_rate": 7.183548387096774e-05, + "loss": 0.1326, + "step": 17963 + }, + { + "epoch": 0.287424, + "grad_norm": 0.7265625, + "learning_rate": 7.183387096774194e-05, + "loss": 0.1892, + "step": 17964 + }, + { + "epoch": 0.28744, + "grad_norm": 0.55078125, + "learning_rate": 7.183225806451614e-05, + "loss": 0.1282, + "step": 17965 + }, + { + "epoch": 0.287456, + "grad_norm": 0.494140625, + "learning_rate": 7.183064516129033e-05, + "loss": 0.1478, + "step": 17966 + }, + { + "epoch": 0.287472, + "grad_norm": 0.64453125, + "learning_rate": 7.182903225806453e-05, + "loss": 0.1859, + "step": 17967 + }, + { + "epoch": 0.287488, + "grad_norm": 0.875, + "learning_rate": 7.182741935483871e-05, + "loss": 0.1911, + "step": 17968 + }, + { + "epoch": 0.287504, + "grad_norm": 0.66015625, + "learning_rate": 7.182580645161291e-05, + "loss": 0.153, + "step": 17969 + }, + { + "epoch": 0.28752, + "grad_norm": 0.765625, + "learning_rate": 7.18241935483871e-05, + "loss": 0.1783, + "step": 17970 + }, + { + "epoch": 0.287536, + "grad_norm": 0.8203125, + "learning_rate": 7.18225806451613e-05, + "loss": 0.1375, + "step": 17971 + }, + { + "epoch": 0.287552, + "grad_norm": 0.8671875, + "learning_rate": 7.182096774193548e-05, + "loss": 0.2075, + "step": 17972 + }, + { + "epoch": 0.287568, + "grad_norm": 0.58984375, + "learning_rate": 7.181935483870968e-05, + "loss": 0.1405, + "step": 17973 + }, + { + "epoch": 0.287584, + "grad_norm": 0.88671875, + "learning_rate": 7.181774193548387e-05, + "loss": 0.163, + "step": 17974 + }, + { + "epoch": 0.2876, + "grad_norm": 0.8046875, + "learning_rate": 7.181612903225807e-05, + "loss": 0.1412, + "step": 17975 + }, + { + "epoch": 0.287616, + "grad_norm": 0.58984375, + "learning_rate": 7.181451612903225e-05, + "loss": 0.198, + "step": 17976 + }, + { + "epoch": 0.287632, + "grad_norm": 0.71484375, + "learning_rate": 7.181290322580645e-05, + "loss": 0.1403, + "step": 17977 + }, + { + "epoch": 0.287648, + "grad_norm": 0.73828125, + "learning_rate": 7.181129032258065e-05, + "loss": 0.1646, + "step": 17978 + }, + { + "epoch": 0.287664, + "grad_norm": 0.74609375, + "learning_rate": 7.180967741935484e-05, + "loss": 0.1931, + "step": 17979 + }, + { + "epoch": 0.28768, + "grad_norm": 0.73046875, + "learning_rate": 7.180806451612904e-05, + "loss": 0.1536, + "step": 17980 + }, + { + "epoch": 0.287696, + "grad_norm": 0.88671875, + "learning_rate": 7.180645161290323e-05, + "loss": 0.1999, + "step": 17981 + }, + { + "epoch": 0.287712, + "grad_norm": 0.73046875, + "learning_rate": 7.180483870967743e-05, + "loss": 0.1836, + "step": 17982 + }, + { + "epoch": 0.287728, + "grad_norm": 0.8203125, + "learning_rate": 7.180322580645161e-05, + "loss": 0.1549, + "step": 17983 + }, + { + "epoch": 0.287744, + "grad_norm": 1.1328125, + "learning_rate": 7.180161290322581e-05, + "loss": 0.1678, + "step": 17984 + }, + { + "epoch": 0.28776, + "grad_norm": 0.8203125, + "learning_rate": 7.18e-05, + "loss": 0.2062, + "step": 17985 + }, + { + "epoch": 0.287776, + "grad_norm": 0.6015625, + "learning_rate": 7.17983870967742e-05, + "loss": 0.1469, + "step": 17986 + }, + { + "epoch": 0.287792, + "grad_norm": 0.796875, + "learning_rate": 7.179677419354838e-05, + "loss": 0.1546, + "step": 17987 + }, + { + "epoch": 0.287808, + "grad_norm": 0.76171875, + "learning_rate": 7.179516129032258e-05, + "loss": 0.1553, + "step": 17988 + }, + { + "epoch": 0.287824, + "grad_norm": 0.455078125, + "learning_rate": 7.179354838709678e-05, + "loss": 0.1464, + "step": 17989 + }, + { + "epoch": 0.28784, + "grad_norm": 0.52734375, + "learning_rate": 7.179193548387098e-05, + "loss": 0.1818, + "step": 17990 + }, + { + "epoch": 0.287856, + "grad_norm": 0.5234375, + "learning_rate": 7.179032258064517e-05, + "loss": 0.1228, + "step": 17991 + }, + { + "epoch": 0.287872, + "grad_norm": 0.578125, + "learning_rate": 7.178870967741937e-05, + "loss": 0.14, + "step": 17992 + }, + { + "epoch": 0.287888, + "grad_norm": 0.66796875, + "learning_rate": 7.178709677419355e-05, + "loss": 0.1753, + "step": 17993 + }, + { + "epoch": 0.287904, + "grad_norm": 0.7109375, + "learning_rate": 7.178548387096774e-05, + "loss": 0.1555, + "step": 17994 + }, + { + "epoch": 0.28792, + "grad_norm": 0.8671875, + "learning_rate": 7.178387096774194e-05, + "loss": 0.1648, + "step": 17995 + }, + { + "epoch": 0.287936, + "grad_norm": 0.76953125, + "learning_rate": 7.178225806451613e-05, + "loss": 0.1911, + "step": 17996 + }, + { + "epoch": 0.287952, + "grad_norm": 0.625, + "learning_rate": 7.178064516129032e-05, + "loss": 0.1637, + "step": 17997 + }, + { + "epoch": 0.287968, + "grad_norm": 0.578125, + "learning_rate": 7.177903225806451e-05, + "loss": 0.167, + "step": 17998 + }, + { + "epoch": 0.287984, + "grad_norm": 0.734375, + "learning_rate": 7.177741935483871e-05, + "loss": 0.156, + "step": 17999 + }, + { + "epoch": 0.288, + "grad_norm": 1.1953125, + "learning_rate": 7.177580645161291e-05, + "loss": 0.1754, + "step": 18000 + }, + { + "epoch": 0.288016, + "grad_norm": 1.0859375, + "learning_rate": 7.177419354838711e-05, + "loss": 0.1853, + "step": 18001 + }, + { + "epoch": 0.288032, + "grad_norm": 0.72265625, + "learning_rate": 7.17725806451613e-05, + "loss": 0.183, + "step": 18002 + }, + { + "epoch": 0.288048, + "grad_norm": 0.78515625, + "learning_rate": 7.17709677419355e-05, + "loss": 0.1947, + "step": 18003 + }, + { + "epoch": 0.288064, + "grad_norm": 0.61328125, + "learning_rate": 7.176935483870968e-05, + "loss": 0.0988, + "step": 18004 + }, + { + "epoch": 0.28808, + "grad_norm": 0.67578125, + "learning_rate": 7.176774193548388e-05, + "loss": 0.1455, + "step": 18005 + }, + { + "epoch": 0.288096, + "grad_norm": 0.72265625, + "learning_rate": 7.176612903225807e-05, + "loss": 0.1694, + "step": 18006 + }, + { + "epoch": 0.288112, + "grad_norm": 0.765625, + "learning_rate": 7.176451612903227e-05, + "loss": 0.2039, + "step": 18007 + }, + { + "epoch": 0.288128, + "grad_norm": 0.80859375, + "learning_rate": 7.176290322580645e-05, + "loss": 0.1818, + "step": 18008 + }, + { + "epoch": 0.288144, + "grad_norm": 0.60546875, + "learning_rate": 7.176129032258064e-05, + "loss": 0.1735, + "step": 18009 + }, + { + "epoch": 0.28816, + "grad_norm": 0.8515625, + "learning_rate": 7.175967741935484e-05, + "loss": 0.1475, + "step": 18010 + }, + { + "epoch": 0.288176, + "grad_norm": 1.2578125, + "learning_rate": 7.175806451612902e-05, + "loss": 0.1654, + "step": 18011 + }, + { + "epoch": 0.288192, + "grad_norm": 0.59375, + "learning_rate": 7.175645161290322e-05, + "loss": 0.2004, + "step": 18012 + }, + { + "epoch": 0.288208, + "grad_norm": 0.6328125, + "learning_rate": 7.175483870967742e-05, + "loss": 0.1376, + "step": 18013 + }, + { + "epoch": 0.288224, + "grad_norm": 0.8828125, + "learning_rate": 7.175322580645162e-05, + "loss": 0.1752, + "step": 18014 + }, + { + "epoch": 0.28824, + "grad_norm": 0.73828125, + "learning_rate": 7.175161290322581e-05, + "loss": 0.1442, + "step": 18015 + }, + { + "epoch": 0.288256, + "grad_norm": 0.78515625, + "learning_rate": 7.175000000000001e-05, + "loss": 0.1678, + "step": 18016 + }, + { + "epoch": 0.288272, + "grad_norm": 0.50390625, + "learning_rate": 7.17483870967742e-05, + "loss": 0.1402, + "step": 18017 + }, + { + "epoch": 0.288288, + "grad_norm": 1.109375, + "learning_rate": 7.17467741935484e-05, + "loss": 0.1961, + "step": 18018 + }, + { + "epoch": 0.288304, + "grad_norm": 1.359375, + "learning_rate": 7.174516129032258e-05, + "loss": 0.2125, + "step": 18019 + }, + { + "epoch": 0.28832, + "grad_norm": 0.65625, + "learning_rate": 7.174354838709678e-05, + "loss": 0.1627, + "step": 18020 + }, + { + "epoch": 0.288336, + "grad_norm": 1.21875, + "learning_rate": 7.174193548387097e-05, + "loss": 0.2053, + "step": 18021 + }, + { + "epoch": 0.288352, + "grad_norm": 0.95703125, + "learning_rate": 7.174032258064517e-05, + "loss": 0.1289, + "step": 18022 + }, + { + "epoch": 0.288368, + "grad_norm": 0.6875, + "learning_rate": 7.173870967741935e-05, + "loss": 0.1606, + "step": 18023 + }, + { + "epoch": 0.288384, + "grad_norm": 0.99609375, + "learning_rate": 7.173709677419355e-05, + "loss": 0.1785, + "step": 18024 + }, + { + "epoch": 0.2884, + "grad_norm": 1.171875, + "learning_rate": 7.173548387096775e-05, + "loss": 0.1572, + "step": 18025 + }, + { + "epoch": 0.288416, + "grad_norm": 0.9375, + "learning_rate": 7.173387096774194e-05, + "loss": 0.2321, + "step": 18026 + }, + { + "epoch": 0.288432, + "grad_norm": 0.8671875, + "learning_rate": 7.173225806451614e-05, + "loss": 0.108, + "step": 18027 + }, + { + "epoch": 0.288448, + "grad_norm": 0.83203125, + "learning_rate": 7.173064516129032e-05, + "loss": 0.1781, + "step": 18028 + }, + { + "epoch": 0.288464, + "grad_norm": 1.625, + "learning_rate": 7.172903225806452e-05, + "loss": 0.173, + "step": 18029 + }, + { + "epoch": 0.28848, + "grad_norm": 0.90234375, + "learning_rate": 7.172741935483871e-05, + "loss": 0.1357, + "step": 18030 + }, + { + "epoch": 0.288496, + "grad_norm": 1.21875, + "learning_rate": 7.172580645161291e-05, + "loss": 0.1789, + "step": 18031 + }, + { + "epoch": 0.288512, + "grad_norm": 0.875, + "learning_rate": 7.17241935483871e-05, + "loss": 0.1587, + "step": 18032 + }, + { + "epoch": 0.288528, + "grad_norm": 0.68359375, + "learning_rate": 7.17225806451613e-05, + "loss": 0.1634, + "step": 18033 + }, + { + "epoch": 0.288544, + "grad_norm": 0.79296875, + "learning_rate": 7.172096774193548e-05, + "loss": 0.218, + "step": 18034 + }, + { + "epoch": 0.28856, + "grad_norm": 0.671875, + "learning_rate": 7.171935483870968e-05, + "loss": 0.1628, + "step": 18035 + }, + { + "epoch": 0.288576, + "grad_norm": 0.5859375, + "learning_rate": 7.171774193548387e-05, + "loss": 0.1687, + "step": 18036 + }, + { + "epoch": 0.288592, + "grad_norm": 0.66796875, + "learning_rate": 7.171612903225807e-05, + "loss": 0.1626, + "step": 18037 + }, + { + "epoch": 0.288608, + "grad_norm": 1.421875, + "learning_rate": 7.171451612903227e-05, + "loss": 0.1279, + "step": 18038 + }, + { + "epoch": 0.288624, + "grad_norm": 0.70703125, + "learning_rate": 7.171290322580647e-05, + "loss": 0.1725, + "step": 18039 + }, + { + "epoch": 0.28864, + "grad_norm": 0.57421875, + "learning_rate": 7.171129032258065e-05, + "loss": 0.1779, + "step": 18040 + }, + { + "epoch": 0.288656, + "grad_norm": 0.65234375, + "learning_rate": 7.170967741935484e-05, + "loss": 0.149, + "step": 18041 + }, + { + "epoch": 0.288672, + "grad_norm": 0.7265625, + "learning_rate": 7.170806451612904e-05, + "loss": 0.2173, + "step": 18042 + }, + { + "epoch": 0.288688, + "grad_norm": 1.5234375, + "learning_rate": 7.170645161290322e-05, + "loss": 0.1672, + "step": 18043 + }, + { + "epoch": 0.288704, + "grad_norm": 1.046875, + "learning_rate": 7.170483870967742e-05, + "loss": 0.1855, + "step": 18044 + }, + { + "epoch": 0.28872, + "grad_norm": 1.171875, + "learning_rate": 7.170322580645161e-05, + "loss": 0.1651, + "step": 18045 + }, + { + "epoch": 0.288736, + "grad_norm": 0.86328125, + "learning_rate": 7.170161290322581e-05, + "loss": 0.1845, + "step": 18046 + }, + { + "epoch": 0.288752, + "grad_norm": 1.015625, + "learning_rate": 7.17e-05, + "loss": 0.2153, + "step": 18047 + }, + { + "epoch": 0.288768, + "grad_norm": 1.109375, + "learning_rate": 7.16983870967742e-05, + "loss": 0.2138, + "step": 18048 + }, + { + "epoch": 0.288784, + "grad_norm": 0.59375, + "learning_rate": 7.16967741935484e-05, + "loss": 0.1509, + "step": 18049 + }, + { + "epoch": 0.2888, + "grad_norm": 0.890625, + "learning_rate": 7.16951612903226e-05, + "loss": 0.1708, + "step": 18050 + }, + { + "epoch": 0.288816, + "grad_norm": 0.7109375, + "learning_rate": 7.169354838709678e-05, + "loss": 0.1864, + "step": 18051 + }, + { + "epoch": 0.288832, + "grad_norm": 0.6171875, + "learning_rate": 7.169193548387098e-05, + "loss": 0.1449, + "step": 18052 + }, + { + "epoch": 0.288848, + "grad_norm": 0.76171875, + "learning_rate": 7.169032258064517e-05, + "loss": 0.1504, + "step": 18053 + }, + { + "epoch": 0.288864, + "grad_norm": 0.734375, + "learning_rate": 7.168870967741936e-05, + "loss": 0.1479, + "step": 18054 + }, + { + "epoch": 0.28888, + "grad_norm": 0.96875, + "learning_rate": 7.168709677419355e-05, + "loss": 0.1756, + "step": 18055 + }, + { + "epoch": 0.288896, + "grad_norm": 0.84765625, + "learning_rate": 7.168548387096774e-05, + "loss": 0.1899, + "step": 18056 + }, + { + "epoch": 0.288912, + "grad_norm": 0.70703125, + "learning_rate": 7.168387096774194e-05, + "loss": 0.1615, + "step": 18057 + }, + { + "epoch": 0.288928, + "grad_norm": 0.7265625, + "learning_rate": 7.168225806451612e-05, + "loss": 0.1994, + "step": 18058 + }, + { + "epoch": 0.288944, + "grad_norm": 1.34375, + "learning_rate": 7.168064516129032e-05, + "loss": 0.1674, + "step": 18059 + }, + { + "epoch": 0.28896, + "grad_norm": 1.28125, + "learning_rate": 7.167903225806452e-05, + "loss": 0.1711, + "step": 18060 + }, + { + "epoch": 0.288976, + "grad_norm": 0.734375, + "learning_rate": 7.167741935483872e-05, + "loss": 0.1873, + "step": 18061 + }, + { + "epoch": 0.288992, + "grad_norm": 0.59375, + "learning_rate": 7.167580645161291e-05, + "loss": 0.1174, + "step": 18062 + }, + { + "epoch": 0.289008, + "grad_norm": 0.56640625, + "learning_rate": 7.167419354838711e-05, + "loss": 0.149, + "step": 18063 + }, + { + "epoch": 0.289024, + "grad_norm": 0.828125, + "learning_rate": 7.16725806451613e-05, + "loss": 0.1845, + "step": 18064 + }, + { + "epoch": 0.28904, + "grad_norm": 0.61328125, + "learning_rate": 7.167096774193549e-05, + "loss": 0.1646, + "step": 18065 + }, + { + "epoch": 0.289056, + "grad_norm": 0.82421875, + "learning_rate": 7.166935483870968e-05, + "loss": 0.1944, + "step": 18066 + }, + { + "epoch": 0.289072, + "grad_norm": 0.9375, + "learning_rate": 7.166774193548388e-05, + "loss": 0.1801, + "step": 18067 + }, + { + "epoch": 0.289088, + "grad_norm": 0.92578125, + "learning_rate": 7.166612903225806e-05, + "loss": 0.1569, + "step": 18068 + }, + { + "epoch": 0.289104, + "grad_norm": 0.90234375, + "learning_rate": 7.166451612903226e-05, + "loss": 0.1471, + "step": 18069 + }, + { + "epoch": 0.28912, + "grad_norm": 0.8828125, + "learning_rate": 7.166290322580645e-05, + "loss": 0.174, + "step": 18070 + }, + { + "epoch": 0.289136, + "grad_norm": 0.640625, + "learning_rate": 7.166129032258064e-05, + "loss": 0.1722, + "step": 18071 + }, + { + "epoch": 0.289152, + "grad_norm": 0.6953125, + "learning_rate": 7.165967741935484e-05, + "loss": 0.1563, + "step": 18072 + }, + { + "epoch": 0.289168, + "grad_norm": 0.6796875, + "learning_rate": 7.165806451612904e-05, + "loss": 0.1507, + "step": 18073 + }, + { + "epoch": 0.289184, + "grad_norm": 0.609375, + "learning_rate": 7.165645161290324e-05, + "loss": 0.1904, + "step": 18074 + }, + { + "epoch": 0.2892, + "grad_norm": 0.88671875, + "learning_rate": 7.165483870967742e-05, + "loss": 0.157, + "step": 18075 + }, + { + "epoch": 0.289216, + "grad_norm": 0.83203125, + "learning_rate": 7.165322580645162e-05, + "loss": 0.126, + "step": 18076 + }, + { + "epoch": 0.289232, + "grad_norm": 0.59765625, + "learning_rate": 7.165161290322581e-05, + "loss": 0.1961, + "step": 18077 + }, + { + "epoch": 0.289248, + "grad_norm": 0.82421875, + "learning_rate": 7.165000000000001e-05, + "loss": 0.1683, + "step": 18078 + }, + { + "epoch": 0.289264, + "grad_norm": 0.60546875, + "learning_rate": 7.164838709677419e-05, + "loss": 0.1487, + "step": 18079 + }, + { + "epoch": 0.28928, + "grad_norm": 0.61328125, + "learning_rate": 7.164677419354839e-05, + "loss": 0.1494, + "step": 18080 + }, + { + "epoch": 0.289296, + "grad_norm": 0.86328125, + "learning_rate": 7.164516129032258e-05, + "loss": 0.1452, + "step": 18081 + }, + { + "epoch": 0.289312, + "grad_norm": 1.3203125, + "learning_rate": 7.164354838709678e-05, + "loss": 0.1811, + "step": 18082 + }, + { + "epoch": 0.289328, + "grad_norm": 0.77734375, + "learning_rate": 7.164193548387096e-05, + "loss": 0.18, + "step": 18083 + }, + { + "epoch": 0.289344, + "grad_norm": 0.66015625, + "learning_rate": 7.164032258064516e-05, + "loss": 0.1928, + "step": 18084 + }, + { + "epoch": 0.28936, + "grad_norm": 0.84375, + "learning_rate": 7.163870967741936e-05, + "loss": 0.1791, + "step": 18085 + }, + { + "epoch": 0.289376, + "grad_norm": 0.6796875, + "learning_rate": 7.163709677419356e-05, + "loss": 0.1465, + "step": 18086 + }, + { + "epoch": 0.289392, + "grad_norm": 1.1328125, + "learning_rate": 7.163548387096775e-05, + "loss": 0.1957, + "step": 18087 + }, + { + "epoch": 0.289408, + "grad_norm": 0.65625, + "learning_rate": 7.163387096774194e-05, + "loss": 0.1828, + "step": 18088 + }, + { + "epoch": 0.289424, + "grad_norm": 0.71875, + "learning_rate": 7.163225806451614e-05, + "loss": 0.151, + "step": 18089 + }, + { + "epoch": 0.28944, + "grad_norm": 0.9609375, + "learning_rate": 7.163064516129032e-05, + "loss": 0.185, + "step": 18090 + }, + { + "epoch": 0.289456, + "grad_norm": 0.69921875, + "learning_rate": 7.162903225806452e-05, + "loss": 0.1629, + "step": 18091 + }, + { + "epoch": 0.289472, + "grad_norm": 0.734375, + "learning_rate": 7.162741935483871e-05, + "loss": 0.1529, + "step": 18092 + }, + { + "epoch": 0.289488, + "grad_norm": 0.984375, + "learning_rate": 7.16258064516129e-05, + "loss": 0.2062, + "step": 18093 + }, + { + "epoch": 0.289504, + "grad_norm": 0.875, + "learning_rate": 7.162419354838709e-05, + "loss": 0.1373, + "step": 18094 + }, + { + "epoch": 0.28952, + "grad_norm": 0.921875, + "learning_rate": 7.162258064516129e-05, + "loss": 0.2122, + "step": 18095 + }, + { + "epoch": 0.289536, + "grad_norm": 0.96875, + "learning_rate": 7.162096774193549e-05, + "loss": 0.1486, + "step": 18096 + }, + { + "epoch": 0.289552, + "grad_norm": 1.125, + "learning_rate": 7.161935483870969e-05, + "loss": 0.1428, + "step": 18097 + }, + { + "epoch": 0.289568, + "grad_norm": 0.61328125, + "learning_rate": 7.161774193548388e-05, + "loss": 0.1742, + "step": 18098 + }, + { + "epoch": 0.289584, + "grad_norm": 0.85546875, + "learning_rate": 7.161612903225808e-05, + "loss": 0.1862, + "step": 18099 + }, + { + "epoch": 0.2896, + "grad_norm": 0.84765625, + "learning_rate": 7.161451612903226e-05, + "loss": 0.1433, + "step": 18100 + }, + { + "epoch": 0.289616, + "grad_norm": 0.85546875, + "learning_rate": 7.161290322580646e-05, + "loss": 0.1724, + "step": 18101 + }, + { + "epoch": 0.289632, + "grad_norm": 0.5625, + "learning_rate": 7.161129032258065e-05, + "loss": 0.1487, + "step": 18102 + }, + { + "epoch": 0.289648, + "grad_norm": 0.72265625, + "learning_rate": 7.160967741935483e-05, + "loss": 0.1319, + "step": 18103 + }, + { + "epoch": 0.289664, + "grad_norm": 0.8671875, + "learning_rate": 7.160806451612903e-05, + "loss": 0.2024, + "step": 18104 + }, + { + "epoch": 0.28968, + "grad_norm": 0.984375, + "learning_rate": 7.160645161290322e-05, + "loss": 0.1649, + "step": 18105 + }, + { + "epoch": 0.289696, + "grad_norm": 0.6484375, + "learning_rate": 7.160483870967742e-05, + "loss": 0.1508, + "step": 18106 + }, + { + "epoch": 0.289712, + "grad_norm": 0.78515625, + "learning_rate": 7.16032258064516e-05, + "loss": 0.1495, + "step": 18107 + }, + { + "epoch": 0.289728, + "grad_norm": 0.5703125, + "learning_rate": 7.16016129032258e-05, + "loss": 0.1665, + "step": 18108 + }, + { + "epoch": 0.289744, + "grad_norm": 0.62109375, + "learning_rate": 7.16e-05, + "loss": 0.1712, + "step": 18109 + }, + { + "epoch": 0.28976, + "grad_norm": 0.76171875, + "learning_rate": 7.15983870967742e-05, + "loss": 0.1767, + "step": 18110 + }, + { + "epoch": 0.289776, + "grad_norm": 0.71875, + "learning_rate": 7.159677419354839e-05, + "loss": 0.1593, + "step": 18111 + }, + { + "epoch": 0.289792, + "grad_norm": 1.109375, + "learning_rate": 7.159516129032259e-05, + "loss": 0.1625, + "step": 18112 + }, + { + "epoch": 0.289808, + "grad_norm": 1.0859375, + "learning_rate": 7.159354838709678e-05, + "loss": 0.1765, + "step": 18113 + }, + { + "epoch": 0.289824, + "grad_norm": 0.66015625, + "learning_rate": 7.159193548387098e-05, + "loss": 0.1357, + "step": 18114 + }, + { + "epoch": 0.28984, + "grad_norm": 0.796875, + "learning_rate": 7.159032258064516e-05, + "loss": 0.1657, + "step": 18115 + }, + { + "epoch": 0.289856, + "grad_norm": 1.03125, + "learning_rate": 7.158870967741936e-05, + "loss": 0.1658, + "step": 18116 + }, + { + "epoch": 0.289872, + "grad_norm": 0.95703125, + "learning_rate": 7.158709677419355e-05, + "loss": 0.182, + "step": 18117 + }, + { + "epoch": 0.289888, + "grad_norm": 1.0703125, + "learning_rate": 7.158548387096773e-05, + "loss": 0.1186, + "step": 18118 + }, + { + "epoch": 0.289904, + "grad_norm": 1.2265625, + "learning_rate": 7.158387096774193e-05, + "loss": 0.1788, + "step": 18119 + }, + { + "epoch": 0.28992, + "grad_norm": 0.828125, + "learning_rate": 7.158225806451613e-05, + "loss": 0.1508, + "step": 18120 + }, + { + "epoch": 0.289936, + "grad_norm": 2.234375, + "learning_rate": 7.158064516129033e-05, + "loss": 0.1702, + "step": 18121 + }, + { + "epoch": 0.289952, + "grad_norm": 1.171875, + "learning_rate": 7.157903225806452e-05, + "loss": 0.1621, + "step": 18122 + }, + { + "epoch": 0.289968, + "grad_norm": 0.69140625, + "learning_rate": 7.157741935483872e-05, + "loss": 0.1949, + "step": 18123 + }, + { + "epoch": 0.289984, + "grad_norm": 0.62890625, + "learning_rate": 7.15758064516129e-05, + "loss": 0.1618, + "step": 18124 + }, + { + "epoch": 0.29, + "grad_norm": 1.3125, + "learning_rate": 7.15741935483871e-05, + "loss": 0.1641, + "step": 18125 + }, + { + "epoch": 0.290016, + "grad_norm": 1.171875, + "learning_rate": 7.157258064516129e-05, + "loss": 0.1498, + "step": 18126 + }, + { + "epoch": 0.290032, + "grad_norm": 0.86328125, + "learning_rate": 7.157096774193549e-05, + "loss": 0.1813, + "step": 18127 + }, + { + "epoch": 0.290048, + "grad_norm": 0.82421875, + "learning_rate": 7.156935483870968e-05, + "loss": 0.1944, + "step": 18128 + }, + { + "epoch": 0.290064, + "grad_norm": 0.6953125, + "learning_rate": 7.156774193548388e-05, + "loss": 0.1635, + "step": 18129 + }, + { + "epoch": 0.29008, + "grad_norm": 0.78515625, + "learning_rate": 7.156612903225806e-05, + "loss": 0.1925, + "step": 18130 + }, + { + "epoch": 0.290096, + "grad_norm": 0.57421875, + "learning_rate": 7.156451612903226e-05, + "loss": 0.1706, + "step": 18131 + }, + { + "epoch": 0.290112, + "grad_norm": 1.1328125, + "learning_rate": 7.156290322580645e-05, + "loss": 0.1711, + "step": 18132 + }, + { + "epoch": 0.290128, + "grad_norm": 1.1796875, + "learning_rate": 7.156129032258065e-05, + "loss": 0.1693, + "step": 18133 + }, + { + "epoch": 0.290144, + "grad_norm": 1.3359375, + "learning_rate": 7.155967741935485e-05, + "loss": 0.1945, + "step": 18134 + }, + { + "epoch": 0.29016, + "grad_norm": 0.56640625, + "learning_rate": 7.155806451612903e-05, + "loss": 0.1516, + "step": 18135 + }, + { + "epoch": 0.290176, + "grad_norm": 0.859375, + "learning_rate": 7.155645161290323e-05, + "loss": 0.1865, + "step": 18136 + }, + { + "epoch": 0.290192, + "grad_norm": 1.0390625, + "learning_rate": 7.155483870967742e-05, + "loss": 0.1841, + "step": 18137 + }, + { + "epoch": 0.290208, + "grad_norm": 0.7890625, + "learning_rate": 7.155322580645162e-05, + "loss": 0.1757, + "step": 18138 + }, + { + "epoch": 0.290224, + "grad_norm": 1.546875, + "learning_rate": 7.15516129032258e-05, + "loss": 0.1937, + "step": 18139 + }, + { + "epoch": 0.29024, + "grad_norm": 0.9296875, + "learning_rate": 7.155e-05, + "loss": 0.1608, + "step": 18140 + }, + { + "epoch": 0.290256, + "grad_norm": 0.77734375, + "learning_rate": 7.154838709677419e-05, + "loss": 0.2011, + "step": 18141 + }, + { + "epoch": 0.290272, + "grad_norm": 0.86328125, + "learning_rate": 7.154677419354839e-05, + "loss": 0.1491, + "step": 18142 + }, + { + "epoch": 0.290288, + "grad_norm": 0.60546875, + "learning_rate": 7.154516129032258e-05, + "loss": 0.1671, + "step": 18143 + }, + { + "epoch": 0.290304, + "grad_norm": 0.83203125, + "learning_rate": 7.154354838709678e-05, + "loss": 0.1631, + "step": 18144 + }, + { + "epoch": 0.29032, + "grad_norm": 0.92578125, + "learning_rate": 7.154193548387098e-05, + "loss": 0.1602, + "step": 18145 + }, + { + "epoch": 0.290336, + "grad_norm": 0.72265625, + "learning_rate": 7.154032258064518e-05, + "loss": 0.1801, + "step": 18146 + }, + { + "epoch": 0.290352, + "grad_norm": 0.78125, + "learning_rate": 7.153870967741936e-05, + "loss": 0.1872, + "step": 18147 + }, + { + "epoch": 0.290368, + "grad_norm": 0.75390625, + "learning_rate": 7.153709677419356e-05, + "loss": 0.1531, + "step": 18148 + }, + { + "epoch": 0.290384, + "grad_norm": 0.71875, + "learning_rate": 7.153548387096775e-05, + "loss": 0.1997, + "step": 18149 + }, + { + "epoch": 0.2904, + "grad_norm": 0.578125, + "learning_rate": 7.153387096774193e-05, + "loss": 0.1948, + "step": 18150 + }, + { + "epoch": 0.290416, + "grad_norm": 0.796875, + "learning_rate": 7.153225806451613e-05, + "loss": 0.1909, + "step": 18151 + }, + { + "epoch": 0.290432, + "grad_norm": 0.7890625, + "learning_rate": 7.153064516129032e-05, + "loss": 0.1563, + "step": 18152 + }, + { + "epoch": 0.290448, + "grad_norm": 0.81640625, + "learning_rate": 7.152903225806452e-05, + "loss": 0.1587, + "step": 18153 + }, + { + "epoch": 0.290464, + "grad_norm": 0.5625, + "learning_rate": 7.15274193548387e-05, + "loss": 0.1712, + "step": 18154 + }, + { + "epoch": 0.29048, + "grad_norm": 0.85546875, + "learning_rate": 7.15258064516129e-05, + "loss": 0.1868, + "step": 18155 + }, + { + "epoch": 0.290496, + "grad_norm": 1.03125, + "learning_rate": 7.15241935483871e-05, + "loss": 0.1882, + "step": 18156 + }, + { + "epoch": 0.290512, + "grad_norm": 1.5546875, + "learning_rate": 7.15225806451613e-05, + "loss": 0.1793, + "step": 18157 + }, + { + "epoch": 0.290528, + "grad_norm": 0.80078125, + "learning_rate": 7.152096774193549e-05, + "loss": 0.1629, + "step": 18158 + }, + { + "epoch": 0.290544, + "grad_norm": 0.6875, + "learning_rate": 7.151935483870969e-05, + "loss": 0.1504, + "step": 18159 + }, + { + "epoch": 0.29056, + "grad_norm": 0.6796875, + "learning_rate": 7.151774193548388e-05, + "loss": 0.1852, + "step": 18160 + }, + { + "epoch": 0.290576, + "grad_norm": 0.6875, + "learning_rate": 7.151612903225807e-05, + "loss": 0.1651, + "step": 18161 + }, + { + "epoch": 0.290592, + "grad_norm": 1.2421875, + "learning_rate": 7.151451612903226e-05, + "loss": 0.1778, + "step": 18162 + }, + { + "epoch": 0.290608, + "grad_norm": 0.6953125, + "learning_rate": 7.151290322580646e-05, + "loss": 0.1593, + "step": 18163 + }, + { + "epoch": 0.290624, + "grad_norm": 0.96484375, + "learning_rate": 7.151129032258065e-05, + "loss": 0.1679, + "step": 18164 + }, + { + "epoch": 0.29064, + "grad_norm": 0.82421875, + "learning_rate": 7.150967741935483e-05, + "loss": 0.1794, + "step": 18165 + }, + { + "epoch": 0.290656, + "grad_norm": 0.7421875, + "learning_rate": 7.150806451612903e-05, + "loss": 0.1615, + "step": 18166 + }, + { + "epoch": 0.290672, + "grad_norm": 1.4765625, + "learning_rate": 7.150645161290322e-05, + "loss": 0.2258, + "step": 18167 + }, + { + "epoch": 0.290688, + "grad_norm": 0.70703125, + "learning_rate": 7.150483870967742e-05, + "loss": 0.1544, + "step": 18168 + }, + { + "epoch": 0.290704, + "grad_norm": 0.62890625, + "learning_rate": 7.150322580645162e-05, + "loss": 0.2007, + "step": 18169 + }, + { + "epoch": 0.29072, + "grad_norm": 0.60546875, + "learning_rate": 7.150161290322582e-05, + "loss": 0.1583, + "step": 18170 + }, + { + "epoch": 0.290736, + "grad_norm": 0.96484375, + "learning_rate": 7.15e-05, + "loss": 0.1727, + "step": 18171 + }, + { + "epoch": 0.290752, + "grad_norm": 0.7734375, + "learning_rate": 7.14983870967742e-05, + "loss": 0.1513, + "step": 18172 + }, + { + "epoch": 0.290768, + "grad_norm": 0.84375, + "learning_rate": 7.149677419354839e-05, + "loss": 0.2191, + "step": 18173 + }, + { + "epoch": 0.290784, + "grad_norm": 0.8671875, + "learning_rate": 7.149516129032259e-05, + "loss": 0.163, + "step": 18174 + }, + { + "epoch": 0.2908, + "grad_norm": 0.93359375, + "learning_rate": 7.149354838709677e-05, + "loss": 0.1605, + "step": 18175 + }, + { + "epoch": 0.290816, + "grad_norm": 0.828125, + "learning_rate": 7.149193548387097e-05, + "loss": 0.1903, + "step": 18176 + }, + { + "epoch": 0.290832, + "grad_norm": 0.57421875, + "learning_rate": 7.149032258064516e-05, + "loss": 0.1887, + "step": 18177 + }, + { + "epoch": 0.290848, + "grad_norm": 1.1875, + "learning_rate": 7.148870967741936e-05, + "loss": 0.165, + "step": 18178 + }, + { + "epoch": 0.290864, + "grad_norm": 0.75, + "learning_rate": 7.148709677419355e-05, + "loss": 0.1598, + "step": 18179 + }, + { + "epoch": 0.29088, + "grad_norm": 0.91796875, + "learning_rate": 7.148548387096775e-05, + "loss": 0.1712, + "step": 18180 + }, + { + "epoch": 0.290896, + "grad_norm": 0.6015625, + "learning_rate": 7.148387096774195e-05, + "loss": 0.133, + "step": 18181 + }, + { + "epoch": 0.290912, + "grad_norm": 0.6953125, + "learning_rate": 7.148225806451613e-05, + "loss": 0.18, + "step": 18182 + }, + { + "epoch": 0.290928, + "grad_norm": 0.9921875, + "learning_rate": 7.148064516129033e-05, + "loss": 0.1819, + "step": 18183 + }, + { + "epoch": 0.290944, + "grad_norm": 0.890625, + "learning_rate": 7.147903225806452e-05, + "loss": 0.1864, + "step": 18184 + }, + { + "epoch": 0.29096, + "grad_norm": 1.09375, + "learning_rate": 7.147741935483872e-05, + "loss": 0.2048, + "step": 18185 + }, + { + "epoch": 0.290976, + "grad_norm": 0.94140625, + "learning_rate": 7.14758064516129e-05, + "loss": 0.1733, + "step": 18186 + }, + { + "epoch": 0.290992, + "grad_norm": 0.76171875, + "learning_rate": 7.14741935483871e-05, + "loss": 0.1445, + "step": 18187 + }, + { + "epoch": 0.291008, + "grad_norm": 0.8125, + "learning_rate": 7.147258064516129e-05, + "loss": 0.1512, + "step": 18188 + }, + { + "epoch": 0.291024, + "grad_norm": 0.64453125, + "learning_rate": 7.147096774193549e-05, + "loss": 0.1669, + "step": 18189 + }, + { + "epoch": 0.29104, + "grad_norm": 0.60546875, + "learning_rate": 7.146935483870967e-05, + "loss": 0.1695, + "step": 18190 + }, + { + "epoch": 0.291056, + "grad_norm": 0.85546875, + "learning_rate": 7.146774193548387e-05, + "loss": 0.1699, + "step": 18191 + }, + { + "epoch": 0.291072, + "grad_norm": 1.4296875, + "learning_rate": 7.146612903225807e-05, + "loss": 0.2167, + "step": 18192 + }, + { + "epoch": 0.291088, + "grad_norm": 1.2265625, + "learning_rate": 7.146451612903226e-05, + "loss": 0.156, + "step": 18193 + }, + { + "epoch": 0.291104, + "grad_norm": 0.765625, + "learning_rate": 7.146290322580646e-05, + "loss": 0.1573, + "step": 18194 + }, + { + "epoch": 0.29112, + "grad_norm": 0.7421875, + "learning_rate": 7.146129032258066e-05, + "loss": 0.1785, + "step": 18195 + }, + { + "epoch": 0.291136, + "grad_norm": 0.80859375, + "learning_rate": 7.145967741935484e-05, + "loss": 0.1479, + "step": 18196 + }, + { + "epoch": 0.291152, + "grad_norm": 0.71875, + "learning_rate": 7.145806451612903e-05, + "loss": 0.1682, + "step": 18197 + }, + { + "epoch": 0.291168, + "grad_norm": 0.91015625, + "learning_rate": 7.145645161290323e-05, + "loss": 0.1794, + "step": 18198 + }, + { + "epoch": 0.291184, + "grad_norm": 0.69921875, + "learning_rate": 7.145483870967742e-05, + "loss": 0.1524, + "step": 18199 + }, + { + "epoch": 0.2912, + "grad_norm": 0.79296875, + "learning_rate": 7.145322580645162e-05, + "loss": 0.1791, + "step": 18200 + }, + { + "epoch": 0.291216, + "grad_norm": 0.625, + "learning_rate": 7.14516129032258e-05, + "loss": 0.1994, + "step": 18201 + }, + { + "epoch": 0.291232, + "grad_norm": 0.70703125, + "learning_rate": 7.145e-05, + "loss": 0.1488, + "step": 18202 + }, + { + "epoch": 0.291248, + "grad_norm": 0.8515625, + "learning_rate": 7.144838709677419e-05, + "loss": 0.146, + "step": 18203 + }, + { + "epoch": 0.291264, + "grad_norm": 0.6171875, + "learning_rate": 7.144677419354839e-05, + "loss": 0.1737, + "step": 18204 + }, + { + "epoch": 0.29128, + "grad_norm": 0.62109375, + "learning_rate": 7.144516129032259e-05, + "loss": 0.1397, + "step": 18205 + }, + { + "epoch": 0.291296, + "grad_norm": 1.4609375, + "learning_rate": 7.144354838709679e-05, + "loss": 0.2341, + "step": 18206 + }, + { + "epoch": 0.291312, + "grad_norm": 0.828125, + "learning_rate": 7.144193548387097e-05, + "loss": 0.1947, + "step": 18207 + }, + { + "epoch": 0.291328, + "grad_norm": 0.953125, + "learning_rate": 7.144032258064517e-05, + "loss": 0.151, + "step": 18208 + }, + { + "epoch": 0.291344, + "grad_norm": 0.65625, + "learning_rate": 7.143870967741936e-05, + "loss": 0.1495, + "step": 18209 + }, + { + "epoch": 0.29136, + "grad_norm": 0.9453125, + "learning_rate": 7.143709677419356e-05, + "loss": 0.1884, + "step": 18210 + }, + { + "epoch": 0.291376, + "grad_norm": 0.73828125, + "learning_rate": 7.143548387096774e-05, + "loss": 0.1737, + "step": 18211 + }, + { + "epoch": 0.291392, + "grad_norm": 0.66015625, + "learning_rate": 7.143387096774193e-05, + "loss": 0.1936, + "step": 18212 + }, + { + "epoch": 0.291408, + "grad_norm": 0.609375, + "learning_rate": 7.143225806451613e-05, + "loss": 0.1327, + "step": 18213 + }, + { + "epoch": 0.291424, + "grad_norm": 1.875, + "learning_rate": 7.143064516129032e-05, + "loss": 0.1544, + "step": 18214 + }, + { + "epoch": 0.29144, + "grad_norm": 0.9921875, + "learning_rate": 7.142903225806452e-05, + "loss": 0.1937, + "step": 18215 + }, + { + "epoch": 0.291456, + "grad_norm": 0.6015625, + "learning_rate": 7.142741935483872e-05, + "loss": 0.1317, + "step": 18216 + }, + { + "epoch": 0.291472, + "grad_norm": 0.703125, + "learning_rate": 7.142580645161292e-05, + "loss": 0.1313, + "step": 18217 + }, + { + "epoch": 0.291488, + "grad_norm": 1.25, + "learning_rate": 7.14241935483871e-05, + "loss": 0.2308, + "step": 18218 + }, + { + "epoch": 0.291504, + "grad_norm": 0.953125, + "learning_rate": 7.14225806451613e-05, + "loss": 0.1419, + "step": 18219 + }, + { + "epoch": 0.29152, + "grad_norm": 0.75, + "learning_rate": 7.142096774193549e-05, + "loss": 0.1216, + "step": 18220 + }, + { + "epoch": 0.291536, + "grad_norm": 0.74609375, + "learning_rate": 7.141935483870969e-05, + "loss": 0.1435, + "step": 18221 + }, + { + "epoch": 0.291552, + "grad_norm": 1.3359375, + "learning_rate": 7.141774193548387e-05, + "loss": 0.1545, + "step": 18222 + }, + { + "epoch": 0.291568, + "grad_norm": 1.0546875, + "learning_rate": 7.141612903225807e-05, + "loss": 0.1674, + "step": 18223 + }, + { + "epoch": 0.291584, + "grad_norm": 1.015625, + "learning_rate": 7.141451612903226e-05, + "loss": 0.1613, + "step": 18224 + }, + { + "epoch": 0.2916, + "grad_norm": 0.56640625, + "learning_rate": 7.141290322580646e-05, + "loss": 0.1409, + "step": 18225 + }, + { + "epoch": 0.291616, + "grad_norm": 1.2109375, + "learning_rate": 7.141129032258064e-05, + "loss": 0.1749, + "step": 18226 + }, + { + "epoch": 0.291632, + "grad_norm": 1.2890625, + "learning_rate": 7.140967741935483e-05, + "loss": 0.2008, + "step": 18227 + }, + { + "epoch": 0.291648, + "grad_norm": 1.7265625, + "learning_rate": 7.140806451612903e-05, + "loss": 0.1564, + "step": 18228 + }, + { + "epoch": 0.291664, + "grad_norm": 0.640625, + "learning_rate": 7.140645161290323e-05, + "loss": 0.1423, + "step": 18229 + }, + { + "epoch": 0.29168, + "grad_norm": 0.77734375, + "learning_rate": 7.140483870967743e-05, + "loss": 0.2241, + "step": 18230 + }, + { + "epoch": 0.291696, + "grad_norm": 0.7265625, + "learning_rate": 7.140322580645162e-05, + "loss": 0.1534, + "step": 18231 + }, + { + "epoch": 0.291712, + "grad_norm": 0.82421875, + "learning_rate": 7.140161290322581e-05, + "loss": 0.1959, + "step": 18232 + }, + { + "epoch": 0.291728, + "grad_norm": 1.203125, + "learning_rate": 7.14e-05, + "loss": 0.1709, + "step": 18233 + }, + { + "epoch": 0.291744, + "grad_norm": 1.0703125, + "learning_rate": 7.13983870967742e-05, + "loss": 0.1668, + "step": 18234 + }, + { + "epoch": 0.29176, + "grad_norm": 0.59375, + "learning_rate": 7.139677419354839e-05, + "loss": 0.1615, + "step": 18235 + }, + { + "epoch": 0.291776, + "grad_norm": 1.625, + "learning_rate": 7.139516129032259e-05, + "loss": 0.1799, + "step": 18236 + }, + { + "epoch": 0.291792, + "grad_norm": 1.390625, + "learning_rate": 7.139354838709677e-05, + "loss": 0.1782, + "step": 18237 + }, + { + "epoch": 0.291808, + "grad_norm": 0.9296875, + "learning_rate": 7.139193548387097e-05, + "loss": 0.172, + "step": 18238 + }, + { + "epoch": 0.291824, + "grad_norm": 0.78125, + "learning_rate": 7.139032258064516e-05, + "loss": 0.172, + "step": 18239 + }, + { + "epoch": 0.29184, + "grad_norm": 1.6328125, + "learning_rate": 7.138870967741936e-05, + "loss": 0.1554, + "step": 18240 + }, + { + "epoch": 0.291856, + "grad_norm": 1.0234375, + "learning_rate": 7.138709677419356e-05, + "loss": 0.2273, + "step": 18241 + }, + { + "epoch": 0.291872, + "grad_norm": 1.796875, + "learning_rate": 7.138548387096774e-05, + "loss": 0.2072, + "step": 18242 + }, + { + "epoch": 0.291888, + "grad_norm": 0.5703125, + "learning_rate": 7.138387096774194e-05, + "loss": 0.1694, + "step": 18243 + }, + { + "epoch": 0.291904, + "grad_norm": 0.94921875, + "learning_rate": 7.138225806451613e-05, + "loss": 0.1645, + "step": 18244 + }, + { + "epoch": 0.29192, + "grad_norm": 0.8046875, + "learning_rate": 7.138064516129033e-05, + "loss": 0.1632, + "step": 18245 + }, + { + "epoch": 0.291936, + "grad_norm": 1.5859375, + "learning_rate": 7.137903225806451e-05, + "loss": 0.1727, + "step": 18246 + }, + { + "epoch": 0.291952, + "grad_norm": 0.8203125, + "learning_rate": 7.137741935483871e-05, + "loss": 0.1998, + "step": 18247 + }, + { + "epoch": 0.291968, + "grad_norm": 1.359375, + "learning_rate": 7.13758064516129e-05, + "loss": 0.1463, + "step": 18248 + }, + { + "epoch": 0.291984, + "grad_norm": 0.65625, + "learning_rate": 7.13741935483871e-05, + "loss": 0.1932, + "step": 18249 + }, + { + "epoch": 0.292, + "grad_norm": 1.3359375, + "learning_rate": 7.137258064516129e-05, + "loss": 0.2043, + "step": 18250 + }, + { + "epoch": 0.292016, + "grad_norm": 0.75, + "learning_rate": 7.137096774193549e-05, + "loss": 0.1768, + "step": 18251 + }, + { + "epoch": 0.292032, + "grad_norm": 1.1875, + "learning_rate": 7.136935483870969e-05, + "loss": 0.1863, + "step": 18252 + }, + { + "epoch": 0.292048, + "grad_norm": 1.1328125, + "learning_rate": 7.136774193548388e-05, + "loss": 0.1679, + "step": 18253 + }, + { + "epoch": 0.292064, + "grad_norm": 0.6796875, + "learning_rate": 7.136612903225807e-05, + "loss": 0.1366, + "step": 18254 + }, + { + "epoch": 0.29208, + "grad_norm": 0.6796875, + "learning_rate": 7.136451612903227e-05, + "loss": 0.1809, + "step": 18255 + }, + { + "epoch": 0.292096, + "grad_norm": 0.427734375, + "learning_rate": 7.136290322580646e-05, + "loss": 0.1351, + "step": 18256 + }, + { + "epoch": 0.292112, + "grad_norm": 0.66015625, + "learning_rate": 7.136129032258066e-05, + "loss": 0.1919, + "step": 18257 + }, + { + "epoch": 0.292128, + "grad_norm": 0.7578125, + "learning_rate": 7.135967741935484e-05, + "loss": 0.1741, + "step": 18258 + }, + { + "epoch": 0.292144, + "grad_norm": 0.67578125, + "learning_rate": 7.135806451612903e-05, + "loss": 0.1844, + "step": 18259 + }, + { + "epoch": 0.29216, + "grad_norm": 1.2734375, + "learning_rate": 7.135645161290323e-05, + "loss": 0.1481, + "step": 18260 + }, + { + "epoch": 0.292176, + "grad_norm": 1.078125, + "learning_rate": 7.135483870967741e-05, + "loss": 0.1466, + "step": 18261 + }, + { + "epoch": 0.292192, + "grad_norm": 0.8046875, + "learning_rate": 7.135322580645161e-05, + "loss": 0.1392, + "step": 18262 + }, + { + "epoch": 0.292208, + "grad_norm": 0.6875, + "learning_rate": 7.13516129032258e-05, + "loss": 0.2081, + "step": 18263 + }, + { + "epoch": 0.292224, + "grad_norm": 1.1796875, + "learning_rate": 7.135e-05, + "loss": 0.1954, + "step": 18264 + }, + { + "epoch": 0.29224, + "grad_norm": 0.671875, + "learning_rate": 7.13483870967742e-05, + "loss": 0.1317, + "step": 18265 + }, + { + "epoch": 0.292256, + "grad_norm": 0.93359375, + "learning_rate": 7.13467741935484e-05, + "loss": 0.1953, + "step": 18266 + }, + { + "epoch": 0.292272, + "grad_norm": 1.421875, + "learning_rate": 7.134516129032258e-05, + "loss": 0.2016, + "step": 18267 + }, + { + "epoch": 0.292288, + "grad_norm": 0.8203125, + "learning_rate": 7.134354838709678e-05, + "loss": 0.1989, + "step": 18268 + }, + { + "epoch": 0.292304, + "grad_norm": 1.09375, + "learning_rate": 7.134193548387097e-05, + "loss": 0.2095, + "step": 18269 + }, + { + "epoch": 0.29232, + "grad_norm": 0.59765625, + "learning_rate": 7.134032258064517e-05, + "loss": 0.181, + "step": 18270 + }, + { + "epoch": 0.292336, + "grad_norm": 0.734375, + "learning_rate": 7.133870967741936e-05, + "loss": 0.1672, + "step": 18271 + }, + { + "epoch": 0.292352, + "grad_norm": 0.546875, + "learning_rate": 7.133709677419356e-05, + "loss": 0.1641, + "step": 18272 + }, + { + "epoch": 0.292368, + "grad_norm": 0.69921875, + "learning_rate": 7.133548387096774e-05, + "loss": 0.1595, + "step": 18273 + }, + { + "epoch": 0.292384, + "grad_norm": 0.8984375, + "learning_rate": 7.133387096774193e-05, + "loss": 0.1404, + "step": 18274 + }, + { + "epoch": 0.2924, + "grad_norm": 0.73046875, + "learning_rate": 7.133225806451613e-05, + "loss": 0.162, + "step": 18275 + }, + { + "epoch": 0.292416, + "grad_norm": 0.90234375, + "learning_rate": 7.133064516129033e-05, + "loss": 0.156, + "step": 18276 + }, + { + "epoch": 0.292432, + "grad_norm": 0.62890625, + "learning_rate": 7.132903225806453e-05, + "loss": 0.1357, + "step": 18277 + }, + { + "epoch": 0.292448, + "grad_norm": 1.5625, + "learning_rate": 7.132741935483871e-05, + "loss": 0.1971, + "step": 18278 + }, + { + "epoch": 0.292464, + "grad_norm": 1.21875, + "learning_rate": 7.132580645161291e-05, + "loss": 0.1952, + "step": 18279 + }, + { + "epoch": 0.29248, + "grad_norm": 0.73828125, + "learning_rate": 7.13241935483871e-05, + "loss": 0.1699, + "step": 18280 + }, + { + "epoch": 0.292496, + "grad_norm": 0.54296875, + "learning_rate": 7.13225806451613e-05, + "loss": 0.1697, + "step": 18281 + }, + { + "epoch": 0.292512, + "grad_norm": 0.8203125, + "learning_rate": 7.132096774193548e-05, + "loss": 0.1595, + "step": 18282 + }, + { + "epoch": 0.292528, + "grad_norm": 0.91796875, + "learning_rate": 7.131935483870968e-05, + "loss": 0.1721, + "step": 18283 + }, + { + "epoch": 0.292544, + "grad_norm": 0.87109375, + "learning_rate": 7.131774193548387e-05, + "loss": 0.1674, + "step": 18284 + }, + { + "epoch": 0.29256, + "grad_norm": 0.96484375, + "learning_rate": 7.131612903225807e-05, + "loss": 0.1705, + "step": 18285 + }, + { + "epoch": 0.292576, + "grad_norm": 0.7265625, + "learning_rate": 7.131451612903226e-05, + "loss": 0.1714, + "step": 18286 + }, + { + "epoch": 0.292592, + "grad_norm": 0.703125, + "learning_rate": 7.131290322580646e-05, + "loss": 0.1537, + "step": 18287 + }, + { + "epoch": 0.292608, + "grad_norm": 0.84765625, + "learning_rate": 7.131129032258064e-05, + "loss": 0.1981, + "step": 18288 + }, + { + "epoch": 0.292624, + "grad_norm": 1.0390625, + "learning_rate": 7.130967741935484e-05, + "loss": 0.1505, + "step": 18289 + }, + { + "epoch": 0.29264, + "grad_norm": 0.8828125, + "learning_rate": 7.130806451612904e-05, + "loss": 0.1639, + "step": 18290 + }, + { + "epoch": 0.292656, + "grad_norm": 0.73046875, + "learning_rate": 7.130645161290323e-05, + "loss": 0.1793, + "step": 18291 + }, + { + "epoch": 0.292672, + "grad_norm": 0.82421875, + "learning_rate": 7.130483870967743e-05, + "loss": 0.1411, + "step": 18292 + }, + { + "epoch": 0.292688, + "grad_norm": 0.71484375, + "learning_rate": 7.130322580645161e-05, + "loss": 0.1648, + "step": 18293 + }, + { + "epoch": 0.292704, + "grad_norm": 0.69140625, + "learning_rate": 7.130161290322581e-05, + "loss": 0.1535, + "step": 18294 + }, + { + "epoch": 0.29272, + "grad_norm": 0.9921875, + "learning_rate": 7.13e-05, + "loss": 0.2218, + "step": 18295 + }, + { + "epoch": 0.292736, + "grad_norm": 0.65234375, + "learning_rate": 7.12983870967742e-05, + "loss": 0.1766, + "step": 18296 + }, + { + "epoch": 0.292752, + "grad_norm": 1.1328125, + "learning_rate": 7.129677419354838e-05, + "loss": 0.1929, + "step": 18297 + }, + { + "epoch": 0.292768, + "grad_norm": 0.73828125, + "learning_rate": 7.129516129032258e-05, + "loss": 0.1566, + "step": 18298 + }, + { + "epoch": 0.292784, + "grad_norm": 1.2265625, + "learning_rate": 7.129354838709677e-05, + "loss": 0.2205, + "step": 18299 + }, + { + "epoch": 0.2928, + "grad_norm": 0.8671875, + "learning_rate": 7.129193548387097e-05, + "loss": 0.1763, + "step": 18300 + }, + { + "epoch": 0.292816, + "grad_norm": 0.734375, + "learning_rate": 7.129032258064517e-05, + "loss": 0.1623, + "step": 18301 + }, + { + "epoch": 0.292832, + "grad_norm": 0.70703125, + "learning_rate": 7.128870967741937e-05, + "loss": 0.2241, + "step": 18302 + }, + { + "epoch": 0.292848, + "grad_norm": 0.671875, + "learning_rate": 7.128709677419355e-05, + "loss": 0.1908, + "step": 18303 + }, + { + "epoch": 0.292864, + "grad_norm": 0.8359375, + "learning_rate": 7.128548387096775e-05, + "loss": 0.1597, + "step": 18304 + }, + { + "epoch": 0.29288, + "grad_norm": 1.1328125, + "learning_rate": 7.128387096774194e-05, + "loss": 0.1787, + "step": 18305 + }, + { + "epoch": 0.292896, + "grad_norm": 0.69921875, + "learning_rate": 7.128225806451613e-05, + "loss": 0.1806, + "step": 18306 + }, + { + "epoch": 0.292912, + "grad_norm": 0.875, + "learning_rate": 7.128064516129033e-05, + "loss": 0.1528, + "step": 18307 + }, + { + "epoch": 0.292928, + "grad_norm": 1.1328125, + "learning_rate": 7.127903225806451e-05, + "loss": 0.1752, + "step": 18308 + }, + { + "epoch": 0.292944, + "grad_norm": 0.92578125, + "learning_rate": 7.127741935483871e-05, + "loss": 0.2048, + "step": 18309 + }, + { + "epoch": 0.29296, + "grad_norm": 0.83203125, + "learning_rate": 7.12758064516129e-05, + "loss": 0.1846, + "step": 18310 + }, + { + "epoch": 0.292976, + "grad_norm": 0.92578125, + "learning_rate": 7.12741935483871e-05, + "loss": 0.1872, + "step": 18311 + }, + { + "epoch": 0.292992, + "grad_norm": 0.84765625, + "learning_rate": 7.12725806451613e-05, + "loss": 0.1797, + "step": 18312 + }, + { + "epoch": 0.293008, + "grad_norm": 1.0703125, + "learning_rate": 7.12709677419355e-05, + "loss": 0.2054, + "step": 18313 + }, + { + "epoch": 0.293024, + "grad_norm": 0.5546875, + "learning_rate": 7.126935483870968e-05, + "loss": 0.1495, + "step": 18314 + }, + { + "epoch": 0.29304, + "grad_norm": 0.77734375, + "learning_rate": 7.126774193548388e-05, + "loss": 0.2059, + "step": 18315 + }, + { + "epoch": 0.293056, + "grad_norm": 0.74609375, + "learning_rate": 7.126612903225807e-05, + "loss": 0.172, + "step": 18316 + }, + { + "epoch": 0.293072, + "grad_norm": 0.71875, + "learning_rate": 7.126451612903227e-05, + "loss": 0.1304, + "step": 18317 + }, + { + "epoch": 0.293088, + "grad_norm": 1.3046875, + "learning_rate": 7.126290322580645e-05, + "loss": 0.1944, + "step": 18318 + }, + { + "epoch": 0.293104, + "grad_norm": 0.94140625, + "learning_rate": 7.126129032258065e-05, + "loss": 0.1764, + "step": 18319 + }, + { + "epoch": 0.29312, + "grad_norm": 0.71484375, + "learning_rate": 7.125967741935484e-05, + "loss": 0.1931, + "step": 18320 + }, + { + "epoch": 0.293136, + "grad_norm": 0.9375, + "learning_rate": 7.125806451612903e-05, + "loss": 0.1357, + "step": 18321 + }, + { + "epoch": 0.293152, + "grad_norm": 1.3046875, + "learning_rate": 7.125645161290323e-05, + "loss": 0.1848, + "step": 18322 + }, + { + "epoch": 0.293168, + "grad_norm": 1.1171875, + "learning_rate": 7.125483870967741e-05, + "loss": 0.182, + "step": 18323 + }, + { + "epoch": 0.293184, + "grad_norm": 0.859375, + "learning_rate": 7.125322580645161e-05, + "loss": 0.205, + "step": 18324 + }, + { + "epoch": 0.2932, + "grad_norm": 0.5625, + "learning_rate": 7.125161290322581e-05, + "loss": 0.1314, + "step": 18325 + }, + { + "epoch": 0.293216, + "grad_norm": 0.83984375, + "learning_rate": 7.125000000000001e-05, + "loss": 0.1755, + "step": 18326 + }, + { + "epoch": 0.293232, + "grad_norm": 0.63671875, + "learning_rate": 7.12483870967742e-05, + "loss": 0.1819, + "step": 18327 + }, + { + "epoch": 0.293248, + "grad_norm": 0.76953125, + "learning_rate": 7.12467741935484e-05, + "loss": 0.1734, + "step": 18328 + }, + { + "epoch": 0.293264, + "grad_norm": 1.0, + "learning_rate": 7.124516129032258e-05, + "loss": 0.1672, + "step": 18329 + }, + { + "epoch": 0.29328, + "grad_norm": 0.95703125, + "learning_rate": 7.124354838709678e-05, + "loss": 0.1786, + "step": 18330 + }, + { + "epoch": 0.293296, + "grad_norm": 0.73046875, + "learning_rate": 7.124193548387097e-05, + "loss": 0.1769, + "step": 18331 + }, + { + "epoch": 0.293312, + "grad_norm": 0.99609375, + "learning_rate": 7.124032258064517e-05, + "loss": 0.2191, + "step": 18332 + }, + { + "epoch": 0.293328, + "grad_norm": 0.9375, + "learning_rate": 7.123870967741935e-05, + "loss": 0.1863, + "step": 18333 + }, + { + "epoch": 0.293344, + "grad_norm": 1.28125, + "learning_rate": 7.123709677419355e-05, + "loss": 0.1731, + "step": 18334 + }, + { + "epoch": 0.29336, + "grad_norm": 0.71875, + "learning_rate": 7.123548387096774e-05, + "loss": 0.1599, + "step": 18335 + }, + { + "epoch": 0.293376, + "grad_norm": 0.7109375, + "learning_rate": 7.123387096774194e-05, + "loss": 0.1913, + "step": 18336 + }, + { + "epoch": 0.293392, + "grad_norm": 0.546875, + "learning_rate": 7.123225806451614e-05, + "loss": 0.1537, + "step": 18337 + }, + { + "epoch": 0.293408, + "grad_norm": 0.703125, + "learning_rate": 7.123064516129032e-05, + "loss": 0.1357, + "step": 18338 + }, + { + "epoch": 0.293424, + "grad_norm": 0.51171875, + "learning_rate": 7.122903225806452e-05, + "loss": 0.189, + "step": 18339 + }, + { + "epoch": 0.29344, + "grad_norm": 0.65625, + "learning_rate": 7.122741935483871e-05, + "loss": 0.1782, + "step": 18340 + }, + { + "epoch": 0.293456, + "grad_norm": 1.0546875, + "learning_rate": 7.122580645161291e-05, + "loss": 0.1795, + "step": 18341 + }, + { + "epoch": 0.293472, + "grad_norm": 1.078125, + "learning_rate": 7.12241935483871e-05, + "loss": 0.1927, + "step": 18342 + }, + { + "epoch": 0.293488, + "grad_norm": 0.734375, + "learning_rate": 7.12225806451613e-05, + "loss": 0.1698, + "step": 18343 + }, + { + "epoch": 0.293504, + "grad_norm": 0.7265625, + "learning_rate": 7.122096774193548e-05, + "loss": 0.1916, + "step": 18344 + }, + { + "epoch": 0.29352, + "grad_norm": 1.1640625, + "learning_rate": 7.121935483870968e-05, + "loss": 0.1875, + "step": 18345 + }, + { + "epoch": 0.293536, + "grad_norm": 1.234375, + "learning_rate": 7.121774193548387e-05, + "loss": 0.1742, + "step": 18346 + }, + { + "epoch": 0.293552, + "grad_norm": 0.87890625, + "learning_rate": 7.121612903225807e-05, + "loss": 0.1543, + "step": 18347 + }, + { + "epoch": 0.293568, + "grad_norm": 0.75390625, + "learning_rate": 7.121451612903227e-05, + "loss": 0.1394, + "step": 18348 + }, + { + "epoch": 0.293584, + "grad_norm": 0.55859375, + "learning_rate": 7.121290322580647e-05, + "loss": 0.151, + "step": 18349 + }, + { + "epoch": 0.2936, + "grad_norm": 0.84765625, + "learning_rate": 7.121129032258065e-05, + "loss": 0.204, + "step": 18350 + }, + { + "epoch": 0.293616, + "grad_norm": 0.5546875, + "learning_rate": 7.120967741935484e-05, + "loss": 0.1535, + "step": 18351 + }, + { + "epoch": 0.293632, + "grad_norm": 0.5859375, + "learning_rate": 7.120806451612904e-05, + "loss": 0.1671, + "step": 18352 + }, + { + "epoch": 0.293648, + "grad_norm": 0.79296875, + "learning_rate": 7.120645161290322e-05, + "loss": 0.1567, + "step": 18353 + }, + { + "epoch": 0.293664, + "grad_norm": 0.58984375, + "learning_rate": 7.120483870967742e-05, + "loss": 0.1329, + "step": 18354 + }, + { + "epoch": 0.29368, + "grad_norm": 0.9375, + "learning_rate": 7.120322580645161e-05, + "loss": 0.1855, + "step": 18355 + }, + { + "epoch": 0.293696, + "grad_norm": 1.2109375, + "learning_rate": 7.120161290322581e-05, + "loss": 0.1429, + "step": 18356 + }, + { + "epoch": 0.293712, + "grad_norm": 0.96875, + "learning_rate": 7.12e-05, + "loss": 0.1794, + "step": 18357 + }, + { + "epoch": 0.293728, + "grad_norm": 0.87890625, + "learning_rate": 7.11983870967742e-05, + "loss": 0.2047, + "step": 18358 + }, + { + "epoch": 0.293744, + "grad_norm": 0.73046875, + "learning_rate": 7.119677419354838e-05, + "loss": 0.1586, + "step": 18359 + }, + { + "epoch": 0.29376, + "grad_norm": 0.59765625, + "learning_rate": 7.119516129032258e-05, + "loss": 0.1539, + "step": 18360 + }, + { + "epoch": 0.293776, + "grad_norm": 0.65234375, + "learning_rate": 7.119354838709678e-05, + "loss": 0.1645, + "step": 18361 + }, + { + "epoch": 0.293792, + "grad_norm": 0.625, + "learning_rate": 7.119193548387098e-05, + "loss": 0.1902, + "step": 18362 + }, + { + "epoch": 0.293808, + "grad_norm": 0.84765625, + "learning_rate": 7.119032258064517e-05, + "loss": 0.1923, + "step": 18363 + }, + { + "epoch": 0.293824, + "grad_norm": 0.53125, + "learning_rate": 7.118870967741937e-05, + "loss": 0.1686, + "step": 18364 + }, + { + "epoch": 0.29384, + "grad_norm": 0.71875, + "learning_rate": 7.118709677419355e-05, + "loss": 0.158, + "step": 18365 + }, + { + "epoch": 0.293856, + "grad_norm": 0.78125, + "learning_rate": 7.118548387096775e-05, + "loss": 0.1624, + "step": 18366 + }, + { + "epoch": 0.293872, + "grad_norm": 0.69140625, + "learning_rate": 7.118387096774194e-05, + "loss": 0.1412, + "step": 18367 + }, + { + "epoch": 0.293888, + "grad_norm": 1.0625, + "learning_rate": 7.118225806451612e-05, + "loss": 0.1453, + "step": 18368 + }, + { + "epoch": 0.293904, + "grad_norm": 1.0, + "learning_rate": 7.118064516129032e-05, + "loss": 0.2033, + "step": 18369 + }, + { + "epoch": 0.29392, + "grad_norm": 0.62109375, + "learning_rate": 7.117903225806451e-05, + "loss": 0.1504, + "step": 18370 + }, + { + "epoch": 0.293936, + "grad_norm": 0.9375, + "learning_rate": 7.117741935483871e-05, + "loss": 0.1851, + "step": 18371 + }, + { + "epoch": 0.293952, + "grad_norm": 0.66796875, + "learning_rate": 7.117580645161291e-05, + "loss": 0.1309, + "step": 18372 + }, + { + "epoch": 0.293968, + "grad_norm": 0.81640625, + "learning_rate": 7.117419354838711e-05, + "loss": 0.1931, + "step": 18373 + }, + { + "epoch": 0.293984, + "grad_norm": 0.81640625, + "learning_rate": 7.11725806451613e-05, + "loss": 0.1978, + "step": 18374 + }, + { + "epoch": 0.294, + "grad_norm": 0.55078125, + "learning_rate": 7.11709677419355e-05, + "loss": 0.149, + "step": 18375 + }, + { + "epoch": 0.294016, + "grad_norm": 0.9375, + "learning_rate": 7.116935483870968e-05, + "loss": 0.1497, + "step": 18376 + }, + { + "epoch": 0.294032, + "grad_norm": 0.7421875, + "learning_rate": 7.116774193548388e-05, + "loss": 0.1685, + "step": 18377 + }, + { + "epoch": 0.294048, + "grad_norm": 0.76171875, + "learning_rate": 7.116612903225807e-05, + "loss": 0.1543, + "step": 18378 + }, + { + "epoch": 0.294064, + "grad_norm": 0.98828125, + "learning_rate": 7.116451612903227e-05, + "loss": 0.175, + "step": 18379 + }, + { + "epoch": 0.29408, + "grad_norm": 1.1171875, + "learning_rate": 7.116290322580645e-05, + "loss": 0.1399, + "step": 18380 + }, + { + "epoch": 0.294096, + "grad_norm": 0.88671875, + "learning_rate": 7.116129032258065e-05, + "loss": 0.1834, + "step": 18381 + }, + { + "epoch": 0.294112, + "grad_norm": 1.25, + "learning_rate": 7.115967741935484e-05, + "loss": 0.19, + "step": 18382 + }, + { + "epoch": 0.294128, + "grad_norm": 1.203125, + "learning_rate": 7.115806451612904e-05, + "loss": 0.185, + "step": 18383 + }, + { + "epoch": 0.294144, + "grad_norm": 0.640625, + "learning_rate": 7.115645161290322e-05, + "loss": 0.1947, + "step": 18384 + }, + { + "epoch": 0.29416, + "grad_norm": 0.9453125, + "learning_rate": 7.115483870967742e-05, + "loss": 0.2206, + "step": 18385 + }, + { + "epoch": 0.294176, + "grad_norm": 0.734375, + "learning_rate": 7.115322580645162e-05, + "loss": 0.158, + "step": 18386 + }, + { + "epoch": 0.294192, + "grad_norm": 1.359375, + "learning_rate": 7.115161290322581e-05, + "loss": 0.1743, + "step": 18387 + }, + { + "epoch": 0.294208, + "grad_norm": 0.6953125, + "learning_rate": 7.115000000000001e-05, + "loss": 0.1709, + "step": 18388 + }, + { + "epoch": 0.294224, + "grad_norm": 1.3359375, + "learning_rate": 7.11483870967742e-05, + "loss": 0.2089, + "step": 18389 + }, + { + "epoch": 0.29424, + "grad_norm": 0.953125, + "learning_rate": 7.11467741935484e-05, + "loss": 0.1441, + "step": 18390 + }, + { + "epoch": 0.294256, + "grad_norm": 0.470703125, + "learning_rate": 7.114516129032258e-05, + "loss": 0.1425, + "step": 18391 + }, + { + "epoch": 0.294272, + "grad_norm": 0.77734375, + "learning_rate": 7.114354838709678e-05, + "loss": 0.1625, + "step": 18392 + }, + { + "epoch": 0.294288, + "grad_norm": 0.75, + "learning_rate": 7.114193548387097e-05, + "loss": 0.1622, + "step": 18393 + }, + { + "epoch": 0.294304, + "grad_norm": 1.03125, + "learning_rate": 7.114032258064517e-05, + "loss": 0.209, + "step": 18394 + }, + { + "epoch": 0.29432, + "grad_norm": 0.87109375, + "learning_rate": 7.113870967741935e-05, + "loss": 0.1566, + "step": 18395 + }, + { + "epoch": 0.294336, + "grad_norm": 0.85546875, + "learning_rate": 7.113709677419355e-05, + "loss": 0.1699, + "step": 18396 + }, + { + "epoch": 0.294352, + "grad_norm": 0.640625, + "learning_rate": 7.113548387096775e-05, + "loss": 0.1823, + "step": 18397 + }, + { + "epoch": 0.294368, + "grad_norm": 0.7265625, + "learning_rate": 7.113387096774194e-05, + "loss": 0.1728, + "step": 18398 + }, + { + "epoch": 0.294384, + "grad_norm": 0.67578125, + "learning_rate": 7.113225806451614e-05, + "loss": 0.1846, + "step": 18399 + }, + { + "epoch": 0.2944, + "grad_norm": 0.83984375, + "learning_rate": 7.113064516129032e-05, + "loss": 0.1978, + "step": 18400 + }, + { + "epoch": 0.294416, + "grad_norm": 0.8515625, + "learning_rate": 7.112903225806452e-05, + "loss": 0.1285, + "step": 18401 + }, + { + "epoch": 0.294432, + "grad_norm": 0.703125, + "learning_rate": 7.112741935483871e-05, + "loss": 0.1482, + "step": 18402 + }, + { + "epoch": 0.294448, + "grad_norm": 0.7265625, + "learning_rate": 7.112580645161291e-05, + "loss": 0.1851, + "step": 18403 + }, + { + "epoch": 0.294464, + "grad_norm": 1.0390625, + "learning_rate": 7.11241935483871e-05, + "loss": 0.152, + "step": 18404 + }, + { + "epoch": 0.29448, + "grad_norm": 0.71484375, + "learning_rate": 7.11225806451613e-05, + "loss": 0.1613, + "step": 18405 + }, + { + "epoch": 0.294496, + "grad_norm": 0.62890625, + "learning_rate": 7.112096774193548e-05, + "loss": 0.1551, + "step": 18406 + }, + { + "epoch": 0.294512, + "grad_norm": 0.640625, + "learning_rate": 7.111935483870968e-05, + "loss": 0.1423, + "step": 18407 + }, + { + "epoch": 0.294528, + "grad_norm": 0.6640625, + "learning_rate": 7.111774193548388e-05, + "loss": 0.1425, + "step": 18408 + }, + { + "epoch": 0.294544, + "grad_norm": 0.62890625, + "learning_rate": 7.111612903225808e-05, + "loss": 0.1416, + "step": 18409 + }, + { + "epoch": 0.29456, + "grad_norm": 0.7890625, + "learning_rate": 7.111451612903226e-05, + "loss": 0.1486, + "step": 18410 + }, + { + "epoch": 0.294576, + "grad_norm": 0.7109375, + "learning_rate": 7.111290322580646e-05, + "loss": 0.1955, + "step": 18411 + }, + { + "epoch": 0.294592, + "grad_norm": 0.5859375, + "learning_rate": 7.111129032258065e-05, + "loss": 0.1313, + "step": 18412 + }, + { + "epoch": 0.294608, + "grad_norm": 0.6015625, + "learning_rate": 7.110967741935485e-05, + "loss": 0.1533, + "step": 18413 + }, + { + "epoch": 0.294624, + "grad_norm": 0.9140625, + "learning_rate": 7.110806451612904e-05, + "loss": 0.1613, + "step": 18414 + }, + { + "epoch": 0.29464, + "grad_norm": 1.09375, + "learning_rate": 7.110645161290322e-05, + "loss": 0.1939, + "step": 18415 + }, + { + "epoch": 0.294656, + "grad_norm": 1.046875, + "learning_rate": 7.110483870967742e-05, + "loss": 0.1703, + "step": 18416 + }, + { + "epoch": 0.294672, + "grad_norm": 0.7734375, + "learning_rate": 7.110322580645161e-05, + "loss": 0.1891, + "step": 18417 + }, + { + "epoch": 0.294688, + "grad_norm": 0.6640625, + "learning_rate": 7.110161290322581e-05, + "loss": 0.1863, + "step": 18418 + }, + { + "epoch": 0.294704, + "grad_norm": 0.62109375, + "learning_rate": 7.11e-05, + "loss": 0.1348, + "step": 18419 + }, + { + "epoch": 0.29472, + "grad_norm": 0.93359375, + "learning_rate": 7.109838709677419e-05, + "loss": 0.1972, + "step": 18420 + }, + { + "epoch": 0.294736, + "grad_norm": 1.046875, + "learning_rate": 7.109677419354839e-05, + "loss": 0.1626, + "step": 18421 + }, + { + "epoch": 0.294752, + "grad_norm": 0.9609375, + "learning_rate": 7.109516129032259e-05, + "loss": 0.1782, + "step": 18422 + }, + { + "epoch": 0.294768, + "grad_norm": 1.15625, + "learning_rate": 7.109354838709678e-05, + "loss": 0.1477, + "step": 18423 + }, + { + "epoch": 0.294784, + "grad_norm": 0.86328125, + "learning_rate": 7.109193548387098e-05, + "loss": 0.198, + "step": 18424 + }, + { + "epoch": 0.2948, + "grad_norm": 1.203125, + "learning_rate": 7.109032258064516e-05, + "loss": 0.1805, + "step": 18425 + }, + { + "epoch": 0.294816, + "grad_norm": 0.875, + "learning_rate": 7.108870967741936e-05, + "loss": 0.14, + "step": 18426 + }, + { + "epoch": 0.294832, + "grad_norm": 1.4296875, + "learning_rate": 7.108709677419355e-05, + "loss": 0.2291, + "step": 18427 + }, + { + "epoch": 0.294848, + "grad_norm": 0.5859375, + "learning_rate": 7.108548387096775e-05, + "loss": 0.1678, + "step": 18428 + }, + { + "epoch": 0.294864, + "grad_norm": 0.7421875, + "learning_rate": 7.108387096774194e-05, + "loss": 0.1965, + "step": 18429 + }, + { + "epoch": 0.29488, + "grad_norm": 0.7734375, + "learning_rate": 7.108225806451612e-05, + "loss": 0.1775, + "step": 18430 + }, + { + "epoch": 0.294896, + "grad_norm": 0.96875, + "learning_rate": 7.108064516129032e-05, + "loss": 0.1942, + "step": 18431 + }, + { + "epoch": 0.294912, + "grad_norm": 0.75390625, + "learning_rate": 7.107903225806452e-05, + "loss": 0.1638, + "step": 18432 + }, + { + "epoch": 0.294928, + "grad_norm": 0.796875, + "learning_rate": 7.107741935483872e-05, + "loss": 0.1797, + "step": 18433 + }, + { + "epoch": 0.294944, + "grad_norm": 0.75390625, + "learning_rate": 7.10758064516129e-05, + "loss": 0.1737, + "step": 18434 + }, + { + "epoch": 0.29496, + "grad_norm": 0.8515625, + "learning_rate": 7.10741935483871e-05, + "loss": 0.1614, + "step": 18435 + }, + { + "epoch": 0.294976, + "grad_norm": 0.98046875, + "learning_rate": 7.107258064516129e-05, + "loss": 0.1975, + "step": 18436 + }, + { + "epoch": 0.294992, + "grad_norm": 1.09375, + "learning_rate": 7.107096774193549e-05, + "loss": 0.1249, + "step": 18437 + }, + { + "epoch": 0.295008, + "grad_norm": 0.625, + "learning_rate": 7.106935483870968e-05, + "loss": 0.1649, + "step": 18438 + }, + { + "epoch": 0.295024, + "grad_norm": 0.67578125, + "learning_rate": 7.106774193548388e-05, + "loss": 0.1727, + "step": 18439 + }, + { + "epoch": 0.29504, + "grad_norm": 0.875, + "learning_rate": 7.106612903225806e-05, + "loss": 0.1713, + "step": 18440 + }, + { + "epoch": 0.295056, + "grad_norm": 0.79296875, + "learning_rate": 7.106451612903226e-05, + "loss": 0.1883, + "step": 18441 + }, + { + "epoch": 0.295072, + "grad_norm": 0.94921875, + "learning_rate": 7.106290322580645e-05, + "loss": 0.1655, + "step": 18442 + }, + { + "epoch": 0.295088, + "grad_norm": 0.59765625, + "learning_rate": 7.106129032258065e-05, + "loss": 0.139, + "step": 18443 + }, + { + "epoch": 0.295104, + "grad_norm": 1.0703125, + "learning_rate": 7.105967741935485e-05, + "loss": 0.2245, + "step": 18444 + }, + { + "epoch": 0.29512, + "grad_norm": 0.66015625, + "learning_rate": 7.105806451612903e-05, + "loss": 0.2088, + "step": 18445 + }, + { + "epoch": 0.295136, + "grad_norm": 1.0625, + "learning_rate": 7.105645161290323e-05, + "loss": 0.1885, + "step": 18446 + }, + { + "epoch": 0.295152, + "grad_norm": 0.5234375, + "learning_rate": 7.105483870967742e-05, + "loss": 0.1535, + "step": 18447 + }, + { + "epoch": 0.295168, + "grad_norm": 0.74609375, + "learning_rate": 7.105322580645162e-05, + "loss": 0.1378, + "step": 18448 + }, + { + "epoch": 0.295184, + "grad_norm": 0.671875, + "learning_rate": 7.10516129032258e-05, + "loss": 0.1405, + "step": 18449 + }, + { + "epoch": 0.2952, + "grad_norm": 0.68359375, + "learning_rate": 7.105e-05, + "loss": 0.1925, + "step": 18450 + }, + { + "epoch": 0.295216, + "grad_norm": 0.625, + "learning_rate": 7.104838709677419e-05, + "loss": 0.1364, + "step": 18451 + }, + { + "epoch": 0.295232, + "grad_norm": 0.8125, + "learning_rate": 7.104677419354839e-05, + "loss": 0.1782, + "step": 18452 + }, + { + "epoch": 0.295248, + "grad_norm": 0.921875, + "learning_rate": 7.104516129032258e-05, + "loss": 0.1724, + "step": 18453 + }, + { + "epoch": 0.295264, + "grad_norm": 1.234375, + "learning_rate": 7.104354838709678e-05, + "loss": 0.2078, + "step": 18454 + }, + { + "epoch": 0.29528, + "grad_norm": 0.875, + "learning_rate": 7.104193548387096e-05, + "loss": 0.1968, + "step": 18455 + }, + { + "epoch": 0.295296, + "grad_norm": 0.90234375, + "learning_rate": 7.104032258064516e-05, + "loss": 0.1727, + "step": 18456 + }, + { + "epoch": 0.295312, + "grad_norm": 1.375, + "learning_rate": 7.103870967741936e-05, + "loss": 0.2045, + "step": 18457 + }, + { + "epoch": 0.295328, + "grad_norm": 0.65625, + "learning_rate": 7.103709677419356e-05, + "loss": 0.1472, + "step": 18458 + }, + { + "epoch": 0.295344, + "grad_norm": 0.80078125, + "learning_rate": 7.103548387096775e-05, + "loss": 0.1423, + "step": 18459 + }, + { + "epoch": 0.29536, + "grad_norm": 0.72265625, + "learning_rate": 7.103387096774193e-05, + "loss": 0.1656, + "step": 18460 + }, + { + "epoch": 0.295376, + "grad_norm": 0.734375, + "learning_rate": 7.103225806451613e-05, + "loss": 0.1858, + "step": 18461 + }, + { + "epoch": 0.295392, + "grad_norm": 0.92578125, + "learning_rate": 7.103064516129032e-05, + "loss": 0.1792, + "step": 18462 + }, + { + "epoch": 0.295408, + "grad_norm": 0.8125, + "learning_rate": 7.102903225806452e-05, + "loss": 0.1662, + "step": 18463 + }, + { + "epoch": 0.295424, + "grad_norm": 0.74609375, + "learning_rate": 7.10274193548387e-05, + "loss": 0.1819, + "step": 18464 + }, + { + "epoch": 0.29544, + "grad_norm": 0.60546875, + "learning_rate": 7.10258064516129e-05, + "loss": 0.1496, + "step": 18465 + }, + { + "epoch": 0.295456, + "grad_norm": 1.3828125, + "learning_rate": 7.102419354838709e-05, + "loss": 0.1808, + "step": 18466 + }, + { + "epoch": 0.295472, + "grad_norm": 0.77734375, + "learning_rate": 7.102258064516129e-05, + "loss": 0.1765, + "step": 18467 + }, + { + "epoch": 0.295488, + "grad_norm": 0.6015625, + "learning_rate": 7.102096774193549e-05, + "loss": 0.1572, + "step": 18468 + }, + { + "epoch": 0.295504, + "grad_norm": 1.0546875, + "learning_rate": 7.101935483870969e-05, + "loss": 0.1612, + "step": 18469 + }, + { + "epoch": 0.29552, + "grad_norm": 0.828125, + "learning_rate": 7.101774193548388e-05, + "loss": 0.158, + "step": 18470 + }, + { + "epoch": 0.295536, + "grad_norm": 1.2109375, + "learning_rate": 7.101612903225808e-05, + "loss": 0.1794, + "step": 18471 + }, + { + "epoch": 0.295552, + "grad_norm": 0.6171875, + "learning_rate": 7.101451612903226e-05, + "loss": 0.1643, + "step": 18472 + }, + { + "epoch": 0.295568, + "grad_norm": 1.0, + "learning_rate": 7.101290322580646e-05, + "loss": 0.1653, + "step": 18473 + }, + { + "epoch": 0.295584, + "grad_norm": 0.70703125, + "learning_rate": 7.101129032258065e-05, + "loss": 0.1782, + "step": 18474 + }, + { + "epoch": 0.2956, + "grad_norm": 0.625, + "learning_rate": 7.100967741935485e-05, + "loss": 0.1654, + "step": 18475 + }, + { + "epoch": 0.295616, + "grad_norm": 1.2734375, + "learning_rate": 7.100806451612903e-05, + "loss": 0.1873, + "step": 18476 + }, + { + "epoch": 0.295632, + "grad_norm": 0.71875, + "learning_rate": 7.100645161290322e-05, + "loss": 0.1892, + "step": 18477 + }, + { + "epoch": 0.295648, + "grad_norm": 0.56640625, + "learning_rate": 7.100483870967742e-05, + "loss": 0.1457, + "step": 18478 + }, + { + "epoch": 0.295664, + "grad_norm": 0.65625, + "learning_rate": 7.10032258064516e-05, + "loss": 0.1777, + "step": 18479 + }, + { + "epoch": 0.29568, + "grad_norm": 0.8359375, + "learning_rate": 7.10016129032258e-05, + "loss": 0.1242, + "step": 18480 + }, + { + "epoch": 0.295696, + "grad_norm": 0.80078125, + "learning_rate": 7.1e-05, + "loss": 0.1628, + "step": 18481 + }, + { + "epoch": 0.295712, + "grad_norm": 0.84765625, + "learning_rate": 7.09983870967742e-05, + "loss": 0.1696, + "step": 18482 + }, + { + "epoch": 0.295728, + "grad_norm": 1.4453125, + "learning_rate": 7.099677419354839e-05, + "loss": 0.2041, + "step": 18483 + }, + { + "epoch": 0.295744, + "grad_norm": 1.234375, + "learning_rate": 7.099516129032259e-05, + "loss": 0.1246, + "step": 18484 + }, + { + "epoch": 0.29576, + "grad_norm": 0.8828125, + "learning_rate": 7.099354838709678e-05, + "loss": 0.2313, + "step": 18485 + }, + { + "epoch": 0.295776, + "grad_norm": 1.359375, + "learning_rate": 7.099193548387098e-05, + "loss": 0.2025, + "step": 18486 + }, + { + "epoch": 0.295792, + "grad_norm": 0.828125, + "learning_rate": 7.099032258064516e-05, + "loss": 0.1445, + "step": 18487 + }, + { + "epoch": 0.295808, + "grad_norm": 0.46875, + "learning_rate": 7.098870967741936e-05, + "loss": 0.1511, + "step": 18488 + }, + { + "epoch": 0.295824, + "grad_norm": 0.94921875, + "learning_rate": 7.098709677419355e-05, + "loss": 0.1766, + "step": 18489 + }, + { + "epoch": 0.29584, + "grad_norm": 0.6328125, + "learning_rate": 7.098548387096775e-05, + "loss": 0.1453, + "step": 18490 + }, + { + "epoch": 0.295856, + "grad_norm": 0.74609375, + "learning_rate": 7.098387096774193e-05, + "loss": 0.1779, + "step": 18491 + }, + { + "epoch": 0.295872, + "grad_norm": 0.8828125, + "learning_rate": 7.098225806451613e-05, + "loss": 0.1481, + "step": 18492 + }, + { + "epoch": 0.295888, + "grad_norm": 1.046875, + "learning_rate": 7.098064516129033e-05, + "loss": 0.1429, + "step": 18493 + }, + { + "epoch": 0.295904, + "grad_norm": 0.94921875, + "learning_rate": 7.097903225806452e-05, + "loss": 0.1743, + "step": 18494 + }, + { + "epoch": 0.29592, + "grad_norm": 0.77734375, + "learning_rate": 7.097741935483872e-05, + "loss": 0.1953, + "step": 18495 + }, + { + "epoch": 0.295936, + "grad_norm": 0.69140625, + "learning_rate": 7.09758064516129e-05, + "loss": 0.1623, + "step": 18496 + }, + { + "epoch": 0.295952, + "grad_norm": 1.3046875, + "learning_rate": 7.09741935483871e-05, + "loss": 0.1947, + "step": 18497 + }, + { + "epoch": 0.295968, + "grad_norm": 0.90625, + "learning_rate": 7.097258064516129e-05, + "loss": 0.1566, + "step": 18498 + }, + { + "epoch": 0.295984, + "grad_norm": 0.609375, + "learning_rate": 7.097096774193549e-05, + "loss": 0.1749, + "step": 18499 + }, + { + "epoch": 0.296, + "grad_norm": 0.6875, + "learning_rate": 7.096935483870968e-05, + "loss": 0.165, + "step": 18500 + }, + { + "epoch": 0.296016, + "grad_norm": 1.234375, + "learning_rate": 7.096774193548388e-05, + "loss": 0.1671, + "step": 18501 + }, + { + "epoch": 0.296032, + "grad_norm": 1.1875, + "learning_rate": 7.096612903225806e-05, + "loss": 0.2053, + "step": 18502 + }, + { + "epoch": 0.296048, + "grad_norm": 0.65234375, + "learning_rate": 7.096451612903226e-05, + "loss": 0.2048, + "step": 18503 + }, + { + "epoch": 0.296064, + "grad_norm": 0.703125, + "learning_rate": 7.096290322580646e-05, + "loss": 0.1551, + "step": 18504 + }, + { + "epoch": 0.29608, + "grad_norm": 0.6953125, + "learning_rate": 7.096129032258066e-05, + "loss": 0.1596, + "step": 18505 + }, + { + "epoch": 0.296096, + "grad_norm": 0.77734375, + "learning_rate": 7.095967741935485e-05, + "loss": 0.1858, + "step": 18506 + }, + { + "epoch": 0.296112, + "grad_norm": 0.7109375, + "learning_rate": 7.095806451612903e-05, + "loss": 0.1581, + "step": 18507 + }, + { + "epoch": 0.296128, + "grad_norm": 1.0, + "learning_rate": 7.095645161290323e-05, + "loss": 0.1899, + "step": 18508 + }, + { + "epoch": 0.296144, + "grad_norm": 0.9765625, + "learning_rate": 7.095483870967742e-05, + "loss": 0.1998, + "step": 18509 + }, + { + "epoch": 0.29616, + "grad_norm": 1.0234375, + "learning_rate": 7.095322580645162e-05, + "loss": 0.1553, + "step": 18510 + }, + { + "epoch": 0.296176, + "grad_norm": 0.9765625, + "learning_rate": 7.09516129032258e-05, + "loss": 0.1914, + "step": 18511 + }, + { + "epoch": 0.296192, + "grad_norm": 0.734375, + "learning_rate": 7.095e-05, + "loss": 0.1605, + "step": 18512 + }, + { + "epoch": 0.296208, + "grad_norm": 0.7265625, + "learning_rate": 7.094838709677419e-05, + "loss": 0.17, + "step": 18513 + }, + { + "epoch": 0.296224, + "grad_norm": 1.0078125, + "learning_rate": 7.094677419354839e-05, + "loss": 0.183, + "step": 18514 + }, + { + "epoch": 0.29624, + "grad_norm": 0.75, + "learning_rate": 7.094516129032258e-05, + "loss": 0.1479, + "step": 18515 + }, + { + "epoch": 0.296256, + "grad_norm": 0.88671875, + "learning_rate": 7.094354838709677e-05, + "loss": 0.1636, + "step": 18516 + }, + { + "epoch": 0.296272, + "grad_norm": 0.984375, + "learning_rate": 7.094193548387097e-05, + "loss": 0.1948, + "step": 18517 + }, + { + "epoch": 0.296288, + "grad_norm": 1.34375, + "learning_rate": 7.094032258064517e-05, + "loss": 0.1513, + "step": 18518 + }, + { + "epoch": 0.296304, + "grad_norm": 0.86328125, + "learning_rate": 7.093870967741936e-05, + "loss": 0.1249, + "step": 18519 + }, + { + "epoch": 0.29632, + "grad_norm": 1.1953125, + "learning_rate": 7.093709677419356e-05, + "loss": 0.1423, + "step": 18520 + }, + { + "epoch": 0.296336, + "grad_norm": 0.82421875, + "learning_rate": 7.093548387096775e-05, + "loss": 0.1925, + "step": 18521 + }, + { + "epoch": 0.296352, + "grad_norm": 0.70703125, + "learning_rate": 7.093387096774195e-05, + "loss": 0.1733, + "step": 18522 + }, + { + "epoch": 0.296368, + "grad_norm": 0.5625, + "learning_rate": 7.093225806451613e-05, + "loss": 0.1431, + "step": 18523 + }, + { + "epoch": 0.296384, + "grad_norm": 0.453125, + "learning_rate": 7.093064516129032e-05, + "loss": 0.149, + "step": 18524 + }, + { + "epoch": 0.2964, + "grad_norm": 1.2578125, + "learning_rate": 7.092903225806452e-05, + "loss": 0.1716, + "step": 18525 + }, + { + "epoch": 0.296416, + "grad_norm": 0.67578125, + "learning_rate": 7.09274193548387e-05, + "loss": 0.1849, + "step": 18526 + }, + { + "epoch": 0.296432, + "grad_norm": 1.125, + "learning_rate": 7.09258064516129e-05, + "loss": 0.2074, + "step": 18527 + }, + { + "epoch": 0.296448, + "grad_norm": 0.80859375, + "learning_rate": 7.09241935483871e-05, + "loss": 0.158, + "step": 18528 + }, + { + "epoch": 0.296464, + "grad_norm": 0.9375, + "learning_rate": 7.09225806451613e-05, + "loss": 0.1747, + "step": 18529 + }, + { + "epoch": 0.29648, + "grad_norm": 0.57421875, + "learning_rate": 7.092096774193549e-05, + "loss": 0.1704, + "step": 18530 + }, + { + "epoch": 0.296496, + "grad_norm": 0.765625, + "learning_rate": 7.091935483870969e-05, + "loss": 0.1643, + "step": 18531 + }, + { + "epoch": 0.296512, + "grad_norm": 1.0234375, + "learning_rate": 7.091774193548387e-05, + "loss": 0.2001, + "step": 18532 + }, + { + "epoch": 0.296528, + "grad_norm": 0.8515625, + "learning_rate": 7.091612903225807e-05, + "loss": 0.1721, + "step": 18533 + }, + { + "epoch": 0.296544, + "grad_norm": 1.0859375, + "learning_rate": 7.091451612903226e-05, + "loss": 0.2122, + "step": 18534 + }, + { + "epoch": 0.29656, + "grad_norm": 0.6484375, + "learning_rate": 7.091290322580646e-05, + "loss": 0.1581, + "step": 18535 + }, + { + "epoch": 0.296576, + "grad_norm": 0.9375, + "learning_rate": 7.091129032258065e-05, + "loss": 0.2062, + "step": 18536 + }, + { + "epoch": 0.296592, + "grad_norm": 0.7421875, + "learning_rate": 7.090967741935485e-05, + "loss": 0.1845, + "step": 18537 + }, + { + "epoch": 0.296608, + "grad_norm": 0.71875, + "learning_rate": 7.090806451612903e-05, + "loss": 0.1676, + "step": 18538 + }, + { + "epoch": 0.296624, + "grad_norm": 1.578125, + "learning_rate": 7.090645161290323e-05, + "loss": 0.1951, + "step": 18539 + }, + { + "epoch": 0.29664, + "grad_norm": 0.984375, + "learning_rate": 7.090483870967742e-05, + "loss": 0.2138, + "step": 18540 + }, + { + "epoch": 0.296656, + "grad_norm": 0.6640625, + "learning_rate": 7.090322580645162e-05, + "loss": 0.1696, + "step": 18541 + }, + { + "epoch": 0.296672, + "grad_norm": 1.0, + "learning_rate": 7.090161290322582e-05, + "loss": 0.1473, + "step": 18542 + }, + { + "epoch": 0.296688, + "grad_norm": 0.86328125, + "learning_rate": 7.09e-05, + "loss": 0.2249, + "step": 18543 + }, + { + "epoch": 0.296704, + "grad_norm": 1.0078125, + "learning_rate": 7.08983870967742e-05, + "loss": 0.1509, + "step": 18544 + }, + { + "epoch": 0.29672, + "grad_norm": 0.76953125, + "learning_rate": 7.089677419354839e-05, + "loss": 0.1465, + "step": 18545 + }, + { + "epoch": 0.296736, + "grad_norm": 0.625, + "learning_rate": 7.089516129032259e-05, + "loss": 0.1561, + "step": 18546 + }, + { + "epoch": 0.296752, + "grad_norm": 0.99609375, + "learning_rate": 7.089354838709677e-05, + "loss": 0.188, + "step": 18547 + }, + { + "epoch": 0.296768, + "grad_norm": 0.625, + "learning_rate": 7.089193548387097e-05, + "loss": 0.1406, + "step": 18548 + }, + { + "epoch": 0.296784, + "grad_norm": 0.625, + "learning_rate": 7.089032258064516e-05, + "loss": 0.1576, + "step": 18549 + }, + { + "epoch": 0.2968, + "grad_norm": 0.61328125, + "learning_rate": 7.088870967741936e-05, + "loss": 0.1477, + "step": 18550 + }, + { + "epoch": 0.296816, + "grad_norm": 0.84765625, + "learning_rate": 7.088709677419355e-05, + "loss": 0.1837, + "step": 18551 + }, + { + "epoch": 0.296832, + "grad_norm": 0.8671875, + "learning_rate": 7.088548387096774e-05, + "loss": 0.2196, + "step": 18552 + }, + { + "epoch": 0.296848, + "grad_norm": 0.91796875, + "learning_rate": 7.088387096774194e-05, + "loss": 0.1809, + "step": 18553 + }, + { + "epoch": 0.296864, + "grad_norm": 1.234375, + "learning_rate": 7.088225806451613e-05, + "loss": 0.1442, + "step": 18554 + }, + { + "epoch": 0.29688, + "grad_norm": 0.703125, + "learning_rate": 7.088064516129033e-05, + "loss": 0.2151, + "step": 18555 + }, + { + "epoch": 0.296896, + "grad_norm": 0.451171875, + "learning_rate": 7.087903225806452e-05, + "loss": 0.1382, + "step": 18556 + }, + { + "epoch": 0.296912, + "grad_norm": 0.7734375, + "learning_rate": 7.087741935483872e-05, + "loss": 0.1757, + "step": 18557 + }, + { + "epoch": 0.296928, + "grad_norm": 0.5390625, + "learning_rate": 7.08758064516129e-05, + "loss": 0.1816, + "step": 18558 + }, + { + "epoch": 0.296944, + "grad_norm": 0.85546875, + "learning_rate": 7.08741935483871e-05, + "loss": 0.1649, + "step": 18559 + }, + { + "epoch": 0.29696, + "grad_norm": 1.1171875, + "learning_rate": 7.087258064516129e-05, + "loss": 0.1553, + "step": 18560 + }, + { + "epoch": 0.296976, + "grad_norm": 0.99609375, + "learning_rate": 7.087096774193549e-05, + "loss": 0.1641, + "step": 18561 + }, + { + "epoch": 0.296992, + "grad_norm": 0.69921875, + "learning_rate": 7.086935483870967e-05, + "loss": 0.1559, + "step": 18562 + }, + { + "epoch": 0.297008, + "grad_norm": 0.703125, + "learning_rate": 7.086774193548387e-05, + "loss": 0.1785, + "step": 18563 + }, + { + "epoch": 0.297024, + "grad_norm": 0.93359375, + "learning_rate": 7.086612903225807e-05, + "loss": 0.2025, + "step": 18564 + }, + { + "epoch": 0.29704, + "grad_norm": 0.55078125, + "learning_rate": 7.086451612903227e-05, + "loss": 0.148, + "step": 18565 + }, + { + "epoch": 0.297056, + "grad_norm": 0.765625, + "learning_rate": 7.086290322580646e-05, + "loss": 0.1941, + "step": 18566 + }, + { + "epoch": 0.297072, + "grad_norm": 1.0546875, + "learning_rate": 7.086129032258066e-05, + "loss": 0.1987, + "step": 18567 + }, + { + "epoch": 0.297088, + "grad_norm": 1.0078125, + "learning_rate": 7.085967741935484e-05, + "loss": 0.1825, + "step": 18568 + }, + { + "epoch": 0.297104, + "grad_norm": 0.75, + "learning_rate": 7.085806451612903e-05, + "loss": 0.1491, + "step": 18569 + }, + { + "epoch": 0.29712, + "grad_norm": 0.90625, + "learning_rate": 7.085645161290323e-05, + "loss": 0.2205, + "step": 18570 + }, + { + "epoch": 0.297136, + "grad_norm": 0.8359375, + "learning_rate": 7.085483870967742e-05, + "loss": 0.1619, + "step": 18571 + }, + { + "epoch": 0.297152, + "grad_norm": 0.72265625, + "learning_rate": 7.085322580645162e-05, + "loss": 0.1617, + "step": 18572 + }, + { + "epoch": 0.297168, + "grad_norm": 0.6875, + "learning_rate": 7.08516129032258e-05, + "loss": 0.149, + "step": 18573 + }, + { + "epoch": 0.297184, + "grad_norm": 0.7421875, + "learning_rate": 7.085e-05, + "loss": 0.178, + "step": 18574 + }, + { + "epoch": 0.2972, + "grad_norm": 0.7109375, + "learning_rate": 7.084838709677419e-05, + "loss": 0.1887, + "step": 18575 + }, + { + "epoch": 0.297216, + "grad_norm": 1.1171875, + "learning_rate": 7.084677419354839e-05, + "loss": 0.1639, + "step": 18576 + }, + { + "epoch": 0.297232, + "grad_norm": 0.796875, + "learning_rate": 7.084516129032259e-05, + "loss": 0.185, + "step": 18577 + }, + { + "epoch": 0.297248, + "grad_norm": 0.640625, + "learning_rate": 7.084354838709679e-05, + "loss": 0.1405, + "step": 18578 + }, + { + "epoch": 0.297264, + "grad_norm": 0.73046875, + "learning_rate": 7.084193548387097e-05, + "loss": 0.1441, + "step": 18579 + }, + { + "epoch": 0.29728, + "grad_norm": 0.75, + "learning_rate": 7.084032258064517e-05, + "loss": 0.1412, + "step": 18580 + }, + { + "epoch": 0.297296, + "grad_norm": 0.80859375, + "learning_rate": 7.083870967741936e-05, + "loss": 0.1127, + "step": 18581 + }, + { + "epoch": 0.297312, + "grad_norm": 0.46875, + "learning_rate": 7.083709677419356e-05, + "loss": 0.151, + "step": 18582 + }, + { + "epoch": 0.297328, + "grad_norm": 1.0859375, + "learning_rate": 7.083548387096774e-05, + "loss": 0.1202, + "step": 18583 + }, + { + "epoch": 0.297344, + "grad_norm": 0.5390625, + "learning_rate": 7.083387096774194e-05, + "loss": 0.1482, + "step": 18584 + }, + { + "epoch": 0.29736, + "grad_norm": 0.58203125, + "learning_rate": 7.083225806451613e-05, + "loss": 0.1164, + "step": 18585 + }, + { + "epoch": 0.297376, + "grad_norm": 0.63671875, + "learning_rate": 7.083064516129032e-05, + "loss": 0.169, + "step": 18586 + }, + { + "epoch": 0.297392, + "grad_norm": 1.0390625, + "learning_rate": 7.082903225806451e-05, + "loss": 0.1362, + "step": 18587 + }, + { + "epoch": 0.297408, + "grad_norm": 1.109375, + "learning_rate": 7.082741935483871e-05, + "loss": 0.2746, + "step": 18588 + }, + { + "epoch": 0.297424, + "grad_norm": 0.80078125, + "learning_rate": 7.082580645161291e-05, + "loss": 0.1467, + "step": 18589 + }, + { + "epoch": 0.29744, + "grad_norm": 0.78125, + "learning_rate": 7.08241935483871e-05, + "loss": 0.1555, + "step": 18590 + }, + { + "epoch": 0.297456, + "grad_norm": 0.7734375, + "learning_rate": 7.08225806451613e-05, + "loss": 0.1896, + "step": 18591 + }, + { + "epoch": 0.297472, + "grad_norm": 0.62109375, + "learning_rate": 7.082096774193549e-05, + "loss": 0.142, + "step": 18592 + }, + { + "epoch": 0.297488, + "grad_norm": 0.72265625, + "learning_rate": 7.081935483870969e-05, + "loss": 0.1564, + "step": 18593 + }, + { + "epoch": 0.297504, + "grad_norm": 0.59765625, + "learning_rate": 7.081774193548387e-05, + "loss": 0.108, + "step": 18594 + }, + { + "epoch": 0.29752, + "grad_norm": 0.92578125, + "learning_rate": 7.081612903225807e-05, + "loss": 0.1499, + "step": 18595 + }, + { + "epoch": 0.297536, + "grad_norm": 0.640625, + "learning_rate": 7.081451612903226e-05, + "loss": 0.1547, + "step": 18596 + }, + { + "epoch": 0.297552, + "grad_norm": 0.5546875, + "learning_rate": 7.081290322580646e-05, + "loss": 0.1444, + "step": 18597 + }, + { + "epoch": 0.297568, + "grad_norm": 0.81640625, + "learning_rate": 7.081129032258064e-05, + "loss": 0.1637, + "step": 18598 + }, + { + "epoch": 0.297584, + "grad_norm": 0.55859375, + "learning_rate": 7.080967741935484e-05, + "loss": 0.1437, + "step": 18599 + }, + { + "epoch": 0.2976, + "grad_norm": 0.63671875, + "learning_rate": 7.080806451612904e-05, + "loss": 0.1486, + "step": 18600 + }, + { + "epoch": 0.297616, + "grad_norm": 1.0234375, + "learning_rate": 7.080645161290323e-05, + "loss": 0.1921, + "step": 18601 + }, + { + "epoch": 0.297632, + "grad_norm": 1.109375, + "learning_rate": 7.080483870967743e-05, + "loss": 0.1703, + "step": 18602 + }, + { + "epoch": 0.297648, + "grad_norm": 0.93359375, + "learning_rate": 7.080322580645161e-05, + "loss": 0.1597, + "step": 18603 + }, + { + "epoch": 0.297664, + "grad_norm": 0.69921875, + "learning_rate": 7.080161290322581e-05, + "loss": 0.1598, + "step": 18604 + }, + { + "epoch": 0.29768, + "grad_norm": 0.71484375, + "learning_rate": 7.08e-05, + "loss": 0.1709, + "step": 18605 + }, + { + "epoch": 0.297696, + "grad_norm": 0.87109375, + "learning_rate": 7.07983870967742e-05, + "loss": 0.1728, + "step": 18606 + }, + { + "epoch": 0.297712, + "grad_norm": 0.8828125, + "learning_rate": 7.079677419354839e-05, + "loss": 0.1823, + "step": 18607 + }, + { + "epoch": 0.297728, + "grad_norm": 1.4453125, + "learning_rate": 7.079516129032259e-05, + "loss": 0.2272, + "step": 18608 + }, + { + "epoch": 0.297744, + "grad_norm": 0.92578125, + "learning_rate": 7.079354838709677e-05, + "loss": 0.1815, + "step": 18609 + }, + { + "epoch": 0.29776, + "grad_norm": 0.85546875, + "learning_rate": 7.079193548387097e-05, + "loss": 0.1877, + "step": 18610 + }, + { + "epoch": 0.297776, + "grad_norm": 0.546875, + "learning_rate": 7.079032258064516e-05, + "loss": 0.1445, + "step": 18611 + }, + { + "epoch": 0.297792, + "grad_norm": 0.66015625, + "learning_rate": 7.078870967741936e-05, + "loss": 0.1927, + "step": 18612 + }, + { + "epoch": 0.297808, + "grad_norm": 0.59375, + "learning_rate": 7.078709677419356e-05, + "loss": 0.1834, + "step": 18613 + }, + { + "epoch": 0.297824, + "grad_norm": 0.73046875, + "learning_rate": 7.078548387096776e-05, + "loss": 0.1659, + "step": 18614 + }, + { + "epoch": 0.29784, + "grad_norm": 0.91796875, + "learning_rate": 7.078387096774194e-05, + "loss": 0.1194, + "step": 18615 + }, + { + "epoch": 0.297856, + "grad_norm": 0.70703125, + "learning_rate": 7.078225806451613e-05, + "loss": 0.1516, + "step": 18616 + }, + { + "epoch": 0.297872, + "grad_norm": 0.96875, + "learning_rate": 7.078064516129033e-05, + "loss": 0.1402, + "step": 18617 + }, + { + "epoch": 0.297888, + "grad_norm": 0.6875, + "learning_rate": 7.077903225806451e-05, + "loss": 0.138, + "step": 18618 + }, + { + "epoch": 0.297904, + "grad_norm": 0.48828125, + "learning_rate": 7.077741935483871e-05, + "loss": 0.146, + "step": 18619 + }, + { + "epoch": 0.29792, + "grad_norm": 0.6953125, + "learning_rate": 7.07758064516129e-05, + "loss": 0.154, + "step": 18620 + }, + { + "epoch": 0.297936, + "grad_norm": 0.7890625, + "learning_rate": 7.07741935483871e-05, + "loss": 0.1813, + "step": 18621 + }, + { + "epoch": 0.297952, + "grad_norm": 0.74609375, + "learning_rate": 7.077258064516129e-05, + "loss": 0.1655, + "step": 18622 + }, + { + "epoch": 0.297968, + "grad_norm": 0.84375, + "learning_rate": 7.077096774193548e-05, + "loss": 0.1665, + "step": 18623 + }, + { + "epoch": 0.297984, + "grad_norm": 0.734375, + "learning_rate": 7.076935483870968e-05, + "loss": 0.1727, + "step": 18624 + }, + { + "epoch": 0.298, + "grad_norm": 0.5703125, + "learning_rate": 7.076774193548388e-05, + "loss": 0.1373, + "step": 18625 + }, + { + "epoch": 0.298016, + "grad_norm": 0.65625, + "learning_rate": 7.076612903225807e-05, + "loss": 0.1584, + "step": 18626 + }, + { + "epoch": 0.298032, + "grad_norm": 0.7421875, + "learning_rate": 7.076451612903227e-05, + "loss": 0.1662, + "step": 18627 + }, + { + "epoch": 0.298048, + "grad_norm": 0.84765625, + "learning_rate": 7.076290322580646e-05, + "loss": 0.1454, + "step": 18628 + }, + { + "epoch": 0.298064, + "grad_norm": 1.2890625, + "learning_rate": 7.076129032258066e-05, + "loss": 0.1924, + "step": 18629 + }, + { + "epoch": 0.29808, + "grad_norm": 0.55078125, + "learning_rate": 7.075967741935484e-05, + "loss": 0.1707, + "step": 18630 + }, + { + "epoch": 0.298096, + "grad_norm": 0.703125, + "learning_rate": 7.075806451612904e-05, + "loss": 0.1871, + "step": 18631 + }, + { + "epoch": 0.298112, + "grad_norm": 0.62890625, + "learning_rate": 7.075645161290323e-05, + "loss": 0.1501, + "step": 18632 + }, + { + "epoch": 0.298128, + "grad_norm": 0.62890625, + "learning_rate": 7.075483870967741e-05, + "loss": 0.1843, + "step": 18633 + }, + { + "epoch": 0.298144, + "grad_norm": 0.69921875, + "learning_rate": 7.075322580645161e-05, + "loss": 0.1948, + "step": 18634 + }, + { + "epoch": 0.29816, + "grad_norm": 0.63671875, + "learning_rate": 7.075161290322581e-05, + "loss": 0.1477, + "step": 18635 + }, + { + "epoch": 0.298176, + "grad_norm": 0.76953125, + "learning_rate": 7.075e-05, + "loss": 0.1774, + "step": 18636 + }, + { + "epoch": 0.298192, + "grad_norm": 1.1953125, + "learning_rate": 7.07483870967742e-05, + "loss": 0.1799, + "step": 18637 + }, + { + "epoch": 0.298208, + "grad_norm": 0.984375, + "learning_rate": 7.07467741935484e-05, + "loss": 0.1886, + "step": 18638 + }, + { + "epoch": 0.298224, + "grad_norm": 0.59765625, + "learning_rate": 7.074516129032258e-05, + "loss": 0.1535, + "step": 18639 + }, + { + "epoch": 0.29824, + "grad_norm": 0.62890625, + "learning_rate": 7.074354838709678e-05, + "loss": 0.2012, + "step": 18640 + }, + { + "epoch": 0.298256, + "grad_norm": 0.87109375, + "learning_rate": 7.074193548387097e-05, + "loss": 0.1893, + "step": 18641 + }, + { + "epoch": 0.298272, + "grad_norm": 0.921875, + "learning_rate": 7.074032258064517e-05, + "loss": 0.2576, + "step": 18642 + }, + { + "epoch": 0.298288, + "grad_norm": 0.9765625, + "learning_rate": 7.073870967741936e-05, + "loss": 0.1961, + "step": 18643 + }, + { + "epoch": 0.298304, + "grad_norm": 1.1171875, + "learning_rate": 7.073709677419355e-05, + "loss": 0.2018, + "step": 18644 + }, + { + "epoch": 0.29832, + "grad_norm": 1.0859375, + "learning_rate": 7.073548387096774e-05, + "loss": 0.1924, + "step": 18645 + }, + { + "epoch": 0.298336, + "grad_norm": 0.7890625, + "learning_rate": 7.073387096774194e-05, + "loss": 0.1306, + "step": 18646 + }, + { + "epoch": 0.298352, + "grad_norm": 0.84375, + "learning_rate": 7.073225806451613e-05, + "loss": 0.1427, + "step": 18647 + }, + { + "epoch": 0.298368, + "grad_norm": 0.72265625, + "learning_rate": 7.073064516129033e-05, + "loss": 0.1474, + "step": 18648 + }, + { + "epoch": 0.298384, + "grad_norm": 0.75, + "learning_rate": 7.072903225806453e-05, + "loss": 0.1677, + "step": 18649 + }, + { + "epoch": 0.2984, + "grad_norm": 0.6484375, + "learning_rate": 7.072741935483871e-05, + "loss": 0.1731, + "step": 18650 + }, + { + "epoch": 0.298416, + "grad_norm": 0.83203125, + "learning_rate": 7.072580645161291e-05, + "loss": 0.1823, + "step": 18651 + }, + { + "epoch": 0.298432, + "grad_norm": 0.69921875, + "learning_rate": 7.07241935483871e-05, + "loss": 0.1697, + "step": 18652 + }, + { + "epoch": 0.298448, + "grad_norm": 1.1171875, + "learning_rate": 7.07225806451613e-05, + "loss": 0.1687, + "step": 18653 + }, + { + "epoch": 0.298464, + "grad_norm": 1.375, + "learning_rate": 7.072096774193548e-05, + "loss": 0.2139, + "step": 18654 + }, + { + "epoch": 0.29848, + "grad_norm": 1.0078125, + "learning_rate": 7.071935483870968e-05, + "loss": 0.1507, + "step": 18655 + }, + { + "epoch": 0.298496, + "grad_norm": 0.80078125, + "learning_rate": 7.071774193548387e-05, + "loss": 0.1806, + "step": 18656 + }, + { + "epoch": 0.298512, + "grad_norm": 1.03125, + "learning_rate": 7.071612903225807e-05, + "loss": 0.1882, + "step": 18657 + }, + { + "epoch": 0.298528, + "grad_norm": 0.73828125, + "learning_rate": 7.071451612903225e-05, + "loss": 0.1387, + "step": 18658 + }, + { + "epoch": 0.298544, + "grad_norm": 1.1875, + "learning_rate": 7.071290322580645e-05, + "loss": 0.164, + "step": 18659 + }, + { + "epoch": 0.29856, + "grad_norm": 0.7734375, + "learning_rate": 7.071129032258065e-05, + "loss": 0.1437, + "step": 18660 + }, + { + "epoch": 0.298576, + "grad_norm": 0.6953125, + "learning_rate": 7.070967741935485e-05, + "loss": 0.1588, + "step": 18661 + }, + { + "epoch": 0.298592, + "grad_norm": 0.765625, + "learning_rate": 7.070806451612904e-05, + "loss": 0.1875, + "step": 18662 + }, + { + "epoch": 0.298608, + "grad_norm": 0.8125, + "learning_rate": 7.070645161290323e-05, + "loss": 0.1846, + "step": 18663 + }, + { + "epoch": 0.298624, + "grad_norm": 0.94921875, + "learning_rate": 7.070483870967743e-05, + "loss": 0.1662, + "step": 18664 + }, + { + "epoch": 0.29864, + "grad_norm": 0.9765625, + "learning_rate": 7.070322580645161e-05, + "loss": 0.1507, + "step": 18665 + }, + { + "epoch": 0.298656, + "grad_norm": 0.703125, + "learning_rate": 7.070161290322581e-05, + "loss": 0.1723, + "step": 18666 + }, + { + "epoch": 0.298672, + "grad_norm": 0.6484375, + "learning_rate": 7.07e-05, + "loss": 0.1408, + "step": 18667 + }, + { + "epoch": 0.298688, + "grad_norm": 1.171875, + "learning_rate": 7.06983870967742e-05, + "loss": 0.1482, + "step": 18668 + }, + { + "epoch": 0.298704, + "grad_norm": 0.76171875, + "learning_rate": 7.069677419354838e-05, + "loss": 0.1707, + "step": 18669 + }, + { + "epoch": 0.29872, + "grad_norm": 0.5859375, + "learning_rate": 7.069516129032258e-05, + "loss": 0.1794, + "step": 18670 + }, + { + "epoch": 0.298736, + "grad_norm": 0.7578125, + "learning_rate": 7.069354838709677e-05, + "loss": 0.1556, + "step": 18671 + }, + { + "epoch": 0.298752, + "grad_norm": 0.59375, + "learning_rate": 7.069193548387097e-05, + "loss": 0.1259, + "step": 18672 + }, + { + "epoch": 0.298768, + "grad_norm": 0.78515625, + "learning_rate": 7.069032258064517e-05, + "loss": 0.1789, + "step": 18673 + }, + { + "epoch": 0.298784, + "grad_norm": 0.79296875, + "learning_rate": 7.068870967741937e-05, + "loss": 0.1319, + "step": 18674 + }, + { + "epoch": 0.2988, + "grad_norm": 1.1015625, + "learning_rate": 7.068709677419355e-05, + "loss": 0.1694, + "step": 18675 + }, + { + "epoch": 0.298816, + "grad_norm": 1.25, + "learning_rate": 7.068548387096775e-05, + "loss": 0.1808, + "step": 18676 + }, + { + "epoch": 0.298832, + "grad_norm": 0.56640625, + "learning_rate": 7.068387096774194e-05, + "loss": 0.1758, + "step": 18677 + }, + { + "epoch": 0.298848, + "grad_norm": 1.0859375, + "learning_rate": 7.068225806451613e-05, + "loss": 0.1344, + "step": 18678 + }, + { + "epoch": 0.298864, + "grad_norm": 0.828125, + "learning_rate": 7.068064516129033e-05, + "loss": 0.1515, + "step": 18679 + }, + { + "epoch": 0.29888, + "grad_norm": 0.75390625, + "learning_rate": 7.067903225806451e-05, + "loss": 0.1903, + "step": 18680 + }, + { + "epoch": 0.298896, + "grad_norm": 0.75390625, + "learning_rate": 7.067741935483871e-05, + "loss": 0.164, + "step": 18681 + }, + { + "epoch": 0.298912, + "grad_norm": 0.51171875, + "learning_rate": 7.06758064516129e-05, + "loss": 0.1525, + "step": 18682 + }, + { + "epoch": 0.298928, + "grad_norm": 1.0546875, + "learning_rate": 7.06741935483871e-05, + "loss": 0.1545, + "step": 18683 + }, + { + "epoch": 0.298944, + "grad_norm": 0.62890625, + "learning_rate": 7.06725806451613e-05, + "loss": 0.1549, + "step": 18684 + }, + { + "epoch": 0.29896, + "grad_norm": 0.69921875, + "learning_rate": 7.06709677419355e-05, + "loss": 0.1337, + "step": 18685 + }, + { + "epoch": 0.298976, + "grad_norm": 0.94140625, + "learning_rate": 7.066935483870968e-05, + "loss": 0.1647, + "step": 18686 + }, + { + "epoch": 0.298992, + "grad_norm": 1.578125, + "learning_rate": 7.066774193548388e-05, + "loss": 0.2221, + "step": 18687 + }, + { + "epoch": 0.299008, + "grad_norm": 1.3046875, + "learning_rate": 7.066612903225807e-05, + "loss": 0.1784, + "step": 18688 + }, + { + "epoch": 0.299024, + "grad_norm": 0.91796875, + "learning_rate": 7.066451612903227e-05, + "loss": 0.1942, + "step": 18689 + }, + { + "epoch": 0.29904, + "grad_norm": 0.546875, + "learning_rate": 7.066290322580645e-05, + "loss": 0.152, + "step": 18690 + }, + { + "epoch": 0.299056, + "grad_norm": 0.69921875, + "learning_rate": 7.066129032258065e-05, + "loss": 0.1711, + "step": 18691 + }, + { + "epoch": 0.299072, + "grad_norm": 0.81640625, + "learning_rate": 7.065967741935484e-05, + "loss": 0.2082, + "step": 18692 + }, + { + "epoch": 0.299088, + "grad_norm": 0.703125, + "learning_rate": 7.065806451612904e-05, + "loss": 0.1805, + "step": 18693 + }, + { + "epoch": 0.299104, + "grad_norm": 0.69140625, + "learning_rate": 7.065645161290322e-05, + "loss": 0.1668, + "step": 18694 + }, + { + "epoch": 0.29912, + "grad_norm": 0.88671875, + "learning_rate": 7.065483870967742e-05, + "loss": 0.1527, + "step": 18695 + }, + { + "epoch": 0.299136, + "grad_norm": 0.6171875, + "learning_rate": 7.065322580645162e-05, + "loss": 0.1698, + "step": 18696 + }, + { + "epoch": 0.299152, + "grad_norm": 0.6171875, + "learning_rate": 7.065161290322581e-05, + "loss": 0.1247, + "step": 18697 + }, + { + "epoch": 0.299168, + "grad_norm": 0.7265625, + "learning_rate": 7.065000000000001e-05, + "loss": 0.1807, + "step": 18698 + }, + { + "epoch": 0.299184, + "grad_norm": 0.90234375, + "learning_rate": 7.06483870967742e-05, + "loss": 0.186, + "step": 18699 + }, + { + "epoch": 0.2992, + "grad_norm": 0.98828125, + "learning_rate": 7.06467741935484e-05, + "loss": 0.1858, + "step": 18700 + }, + { + "epoch": 0.299216, + "grad_norm": 0.9453125, + "learning_rate": 7.064516129032258e-05, + "loss": 0.1575, + "step": 18701 + }, + { + "epoch": 0.299232, + "grad_norm": 0.58203125, + "learning_rate": 7.064354838709678e-05, + "loss": 0.1604, + "step": 18702 + }, + { + "epoch": 0.299248, + "grad_norm": 1.03125, + "learning_rate": 7.064193548387097e-05, + "loss": 0.1536, + "step": 18703 + }, + { + "epoch": 0.299264, + "grad_norm": 0.671875, + "learning_rate": 7.064032258064517e-05, + "loss": 0.148, + "step": 18704 + }, + { + "epoch": 0.29928, + "grad_norm": 0.79296875, + "learning_rate": 7.063870967741935e-05, + "loss": 0.1654, + "step": 18705 + }, + { + "epoch": 0.299296, + "grad_norm": 0.625, + "learning_rate": 7.063709677419355e-05, + "loss": 0.1423, + "step": 18706 + }, + { + "epoch": 0.299312, + "grad_norm": 0.61328125, + "learning_rate": 7.063548387096774e-05, + "loss": 0.1725, + "step": 18707 + }, + { + "epoch": 0.299328, + "grad_norm": 0.91796875, + "learning_rate": 7.063387096774194e-05, + "loss": 0.1532, + "step": 18708 + }, + { + "epoch": 0.299344, + "grad_norm": 0.90625, + "learning_rate": 7.063225806451614e-05, + "loss": 0.1457, + "step": 18709 + }, + { + "epoch": 0.29936, + "grad_norm": 1.328125, + "learning_rate": 7.063064516129032e-05, + "loss": 0.1595, + "step": 18710 + }, + { + "epoch": 0.299376, + "grad_norm": 0.796875, + "learning_rate": 7.062903225806452e-05, + "loss": 0.1791, + "step": 18711 + }, + { + "epoch": 0.299392, + "grad_norm": 1.3046875, + "learning_rate": 7.062741935483871e-05, + "loss": 0.1925, + "step": 18712 + }, + { + "epoch": 0.299408, + "grad_norm": 1.0859375, + "learning_rate": 7.062580645161291e-05, + "loss": 0.1659, + "step": 18713 + }, + { + "epoch": 0.299424, + "grad_norm": 1.0234375, + "learning_rate": 7.06241935483871e-05, + "loss": 0.1733, + "step": 18714 + }, + { + "epoch": 0.29944, + "grad_norm": 0.765625, + "learning_rate": 7.06225806451613e-05, + "loss": 0.1825, + "step": 18715 + }, + { + "epoch": 0.299456, + "grad_norm": 0.7109375, + "learning_rate": 7.062096774193548e-05, + "loss": 0.1808, + "step": 18716 + }, + { + "epoch": 0.299472, + "grad_norm": 1.0078125, + "learning_rate": 7.061935483870968e-05, + "loss": 0.2039, + "step": 18717 + }, + { + "epoch": 0.299488, + "grad_norm": 0.8203125, + "learning_rate": 7.061774193548387e-05, + "loss": 0.1602, + "step": 18718 + }, + { + "epoch": 0.299504, + "grad_norm": 0.77734375, + "learning_rate": 7.061612903225807e-05, + "loss": 0.1814, + "step": 18719 + }, + { + "epoch": 0.29952, + "grad_norm": 0.5703125, + "learning_rate": 7.061451612903227e-05, + "loss": 0.1614, + "step": 18720 + }, + { + "epoch": 0.299536, + "grad_norm": 1.0625, + "learning_rate": 7.061290322580647e-05, + "loss": 0.1869, + "step": 18721 + }, + { + "epoch": 0.299552, + "grad_norm": 0.5625, + "learning_rate": 7.061129032258065e-05, + "loss": 0.1653, + "step": 18722 + }, + { + "epoch": 0.299568, + "grad_norm": 0.63671875, + "learning_rate": 7.060967741935485e-05, + "loss": 0.1301, + "step": 18723 + }, + { + "epoch": 0.299584, + "grad_norm": 0.7421875, + "learning_rate": 7.060806451612904e-05, + "loss": 0.1911, + "step": 18724 + }, + { + "epoch": 0.2996, + "grad_norm": 0.5703125, + "learning_rate": 7.060645161290322e-05, + "loss": 0.1467, + "step": 18725 + }, + { + "epoch": 0.299616, + "grad_norm": 0.58203125, + "learning_rate": 7.060483870967742e-05, + "loss": 0.1199, + "step": 18726 + }, + { + "epoch": 0.299632, + "grad_norm": 0.69921875, + "learning_rate": 7.060322580645161e-05, + "loss": 0.1923, + "step": 18727 + }, + { + "epoch": 0.299648, + "grad_norm": 1.140625, + "learning_rate": 7.060161290322581e-05, + "loss": 0.2306, + "step": 18728 + }, + { + "epoch": 0.299664, + "grad_norm": 0.70703125, + "learning_rate": 7.06e-05, + "loss": 0.1657, + "step": 18729 + }, + { + "epoch": 0.29968, + "grad_norm": 0.734375, + "learning_rate": 7.05983870967742e-05, + "loss": 0.1646, + "step": 18730 + }, + { + "epoch": 0.299696, + "grad_norm": 0.9453125, + "learning_rate": 7.059677419354838e-05, + "loss": 0.1829, + "step": 18731 + }, + { + "epoch": 0.299712, + "grad_norm": 1.140625, + "learning_rate": 7.059516129032258e-05, + "loss": 0.2261, + "step": 18732 + }, + { + "epoch": 0.299728, + "grad_norm": 0.62109375, + "learning_rate": 7.059354838709678e-05, + "loss": 0.2163, + "step": 18733 + }, + { + "epoch": 0.299744, + "grad_norm": 1.203125, + "learning_rate": 7.059193548387098e-05, + "loss": 0.2086, + "step": 18734 + }, + { + "epoch": 0.29976, + "grad_norm": 2.234375, + "learning_rate": 7.059032258064517e-05, + "loss": 0.1988, + "step": 18735 + }, + { + "epoch": 0.299776, + "grad_norm": 1.5, + "learning_rate": 7.058870967741937e-05, + "loss": 0.1735, + "step": 18736 + }, + { + "epoch": 0.299792, + "grad_norm": 0.6796875, + "learning_rate": 7.058709677419355e-05, + "loss": 0.178, + "step": 18737 + }, + { + "epoch": 0.299808, + "grad_norm": 0.875, + "learning_rate": 7.058548387096775e-05, + "loss": 0.1684, + "step": 18738 + }, + { + "epoch": 0.299824, + "grad_norm": 0.60546875, + "learning_rate": 7.058387096774194e-05, + "loss": 0.1694, + "step": 18739 + }, + { + "epoch": 0.29984, + "grad_norm": 0.70703125, + "learning_rate": 7.058225806451612e-05, + "loss": 0.1841, + "step": 18740 + }, + { + "epoch": 0.299856, + "grad_norm": 0.71875, + "learning_rate": 7.058064516129032e-05, + "loss": 0.1947, + "step": 18741 + }, + { + "epoch": 0.299872, + "grad_norm": 0.5625, + "learning_rate": 7.057903225806451e-05, + "loss": 0.1646, + "step": 18742 + }, + { + "epoch": 0.299888, + "grad_norm": 1.28125, + "learning_rate": 7.057741935483871e-05, + "loss": 0.1844, + "step": 18743 + }, + { + "epoch": 0.299904, + "grad_norm": 0.671875, + "learning_rate": 7.057580645161291e-05, + "loss": 0.1826, + "step": 18744 + }, + { + "epoch": 0.29992, + "grad_norm": 0.73828125, + "learning_rate": 7.057419354838711e-05, + "loss": 0.1766, + "step": 18745 + }, + { + "epoch": 0.299936, + "grad_norm": 0.66796875, + "learning_rate": 7.05725806451613e-05, + "loss": 0.1819, + "step": 18746 + }, + { + "epoch": 0.299952, + "grad_norm": 0.69921875, + "learning_rate": 7.05709677419355e-05, + "loss": 0.1612, + "step": 18747 + }, + { + "epoch": 0.299968, + "grad_norm": 0.625, + "learning_rate": 7.056935483870968e-05, + "loss": 0.1416, + "step": 18748 + }, + { + "epoch": 0.299984, + "grad_norm": 0.671875, + "learning_rate": 7.056774193548388e-05, + "loss": 0.1441, + "step": 18749 + }, + { + "epoch": 0.3, + "grad_norm": 0.890625, + "learning_rate": 7.056612903225807e-05, + "loss": 0.1539, + "step": 18750 + }, + { + "epoch": 0.300016, + "grad_norm": 0.96875, + "learning_rate": 7.056451612903226e-05, + "loss": 0.149, + "step": 18751 + }, + { + "epoch": 0.300032, + "grad_norm": 1.7109375, + "learning_rate": 7.056290322580645e-05, + "loss": 0.1498, + "step": 18752 + }, + { + "epoch": 0.300048, + "grad_norm": 0.6953125, + "learning_rate": 7.056129032258065e-05, + "loss": 0.1673, + "step": 18753 + }, + { + "epoch": 0.300064, + "grad_norm": 0.6953125, + "learning_rate": 7.055967741935484e-05, + "loss": 0.1475, + "step": 18754 + }, + { + "epoch": 0.30008, + "grad_norm": 0.482421875, + "learning_rate": 7.055806451612904e-05, + "loss": 0.1375, + "step": 18755 + }, + { + "epoch": 0.300096, + "grad_norm": 0.91015625, + "learning_rate": 7.055645161290324e-05, + "loss": 0.1961, + "step": 18756 + }, + { + "epoch": 0.300112, + "grad_norm": 0.98828125, + "learning_rate": 7.055483870967742e-05, + "loss": 0.1791, + "step": 18757 + }, + { + "epoch": 0.300128, + "grad_norm": 0.7578125, + "learning_rate": 7.055322580645162e-05, + "loss": 0.1362, + "step": 18758 + }, + { + "epoch": 0.300144, + "grad_norm": 1.3671875, + "learning_rate": 7.055161290322581e-05, + "loss": 0.2098, + "step": 18759 + }, + { + "epoch": 0.30016, + "grad_norm": 0.62890625, + "learning_rate": 7.055000000000001e-05, + "loss": 0.1735, + "step": 18760 + }, + { + "epoch": 0.300176, + "grad_norm": 1.25, + "learning_rate": 7.05483870967742e-05, + "loss": 0.1664, + "step": 18761 + }, + { + "epoch": 0.300192, + "grad_norm": 0.6796875, + "learning_rate": 7.054677419354839e-05, + "loss": 0.1735, + "step": 18762 + }, + { + "epoch": 0.300208, + "grad_norm": 0.81640625, + "learning_rate": 7.054516129032258e-05, + "loss": 0.1638, + "step": 18763 + }, + { + "epoch": 0.300224, + "grad_norm": 0.82421875, + "learning_rate": 7.054354838709678e-05, + "loss": 0.1328, + "step": 18764 + }, + { + "epoch": 0.30024, + "grad_norm": 0.7734375, + "learning_rate": 7.054193548387096e-05, + "loss": 0.1586, + "step": 18765 + }, + { + "epoch": 0.300256, + "grad_norm": 1.71875, + "learning_rate": 7.054032258064516e-05, + "loss": 0.1984, + "step": 18766 + }, + { + "epoch": 0.300272, + "grad_norm": 0.86328125, + "learning_rate": 7.053870967741935e-05, + "loss": 0.1623, + "step": 18767 + }, + { + "epoch": 0.300288, + "grad_norm": 1.3125, + "learning_rate": 7.053709677419355e-05, + "loss": 0.184, + "step": 18768 + }, + { + "epoch": 0.300304, + "grad_norm": 0.84765625, + "learning_rate": 7.053548387096775e-05, + "loss": 0.1765, + "step": 18769 + }, + { + "epoch": 0.30032, + "grad_norm": 1.0625, + "learning_rate": 7.053387096774195e-05, + "loss": 0.2005, + "step": 18770 + }, + { + "epoch": 0.300336, + "grad_norm": 0.90625, + "learning_rate": 7.053225806451614e-05, + "loss": 0.1753, + "step": 18771 + }, + { + "epoch": 0.300352, + "grad_norm": 0.6015625, + "learning_rate": 7.053064516129032e-05, + "loss": 0.1495, + "step": 18772 + }, + { + "epoch": 0.300368, + "grad_norm": 1.2578125, + "learning_rate": 7.052903225806452e-05, + "loss": 0.1826, + "step": 18773 + }, + { + "epoch": 0.300384, + "grad_norm": 0.478515625, + "learning_rate": 7.052741935483871e-05, + "loss": 0.1228, + "step": 18774 + }, + { + "epoch": 0.3004, + "grad_norm": 1.3359375, + "learning_rate": 7.052580645161291e-05, + "loss": 0.2173, + "step": 18775 + }, + { + "epoch": 0.300416, + "grad_norm": 0.8359375, + "learning_rate": 7.052419354838709e-05, + "loss": 0.1684, + "step": 18776 + }, + { + "epoch": 0.300432, + "grad_norm": 0.796875, + "learning_rate": 7.052258064516129e-05, + "loss": 0.1803, + "step": 18777 + }, + { + "epoch": 0.300448, + "grad_norm": 1.0, + "learning_rate": 7.052096774193548e-05, + "loss": 0.179, + "step": 18778 + }, + { + "epoch": 0.300464, + "grad_norm": 0.89453125, + "learning_rate": 7.051935483870968e-05, + "loss": 0.1297, + "step": 18779 + }, + { + "epoch": 0.30048, + "grad_norm": 0.96484375, + "learning_rate": 7.051774193548388e-05, + "loss": 0.1508, + "step": 18780 + }, + { + "epoch": 0.300496, + "grad_norm": 0.5546875, + "learning_rate": 7.051612903225808e-05, + "loss": 0.1522, + "step": 18781 + }, + { + "epoch": 0.300512, + "grad_norm": 0.859375, + "learning_rate": 7.051451612903226e-05, + "loss": 0.1636, + "step": 18782 + }, + { + "epoch": 0.300528, + "grad_norm": 0.8359375, + "learning_rate": 7.051290322580646e-05, + "loss": 0.154, + "step": 18783 + }, + { + "epoch": 0.300544, + "grad_norm": 0.5703125, + "learning_rate": 7.051129032258065e-05, + "loss": 0.1628, + "step": 18784 + }, + { + "epoch": 0.30056, + "grad_norm": 0.90625, + "learning_rate": 7.050967741935485e-05, + "loss": 0.1952, + "step": 18785 + }, + { + "epoch": 0.300576, + "grad_norm": 0.75390625, + "learning_rate": 7.050806451612903e-05, + "loss": 0.1487, + "step": 18786 + }, + { + "epoch": 0.300592, + "grad_norm": 0.6640625, + "learning_rate": 7.050645161290322e-05, + "loss": 0.2029, + "step": 18787 + }, + { + "epoch": 0.300608, + "grad_norm": 0.70703125, + "learning_rate": 7.050483870967742e-05, + "loss": 0.1676, + "step": 18788 + }, + { + "epoch": 0.300624, + "grad_norm": 0.60546875, + "learning_rate": 7.05032258064516e-05, + "loss": 0.1454, + "step": 18789 + }, + { + "epoch": 0.30064, + "grad_norm": 0.640625, + "learning_rate": 7.05016129032258e-05, + "loss": 0.177, + "step": 18790 + }, + { + "epoch": 0.300656, + "grad_norm": 1.3203125, + "learning_rate": 7.05e-05, + "loss": 0.1711, + "step": 18791 + }, + { + "epoch": 0.300672, + "grad_norm": 0.7578125, + "learning_rate": 7.049838709677419e-05, + "loss": 0.1591, + "step": 18792 + }, + { + "epoch": 0.300688, + "grad_norm": 0.8046875, + "learning_rate": 7.049677419354839e-05, + "loss": 0.1929, + "step": 18793 + }, + { + "epoch": 0.300704, + "grad_norm": 0.98828125, + "learning_rate": 7.049516129032259e-05, + "loss": 0.1418, + "step": 18794 + }, + { + "epoch": 0.30072, + "grad_norm": 0.6015625, + "learning_rate": 7.049354838709678e-05, + "loss": 0.1606, + "step": 18795 + }, + { + "epoch": 0.300736, + "grad_norm": 0.71484375, + "learning_rate": 7.049193548387098e-05, + "loss": 0.1622, + "step": 18796 + }, + { + "epoch": 0.300752, + "grad_norm": 1.28125, + "learning_rate": 7.049032258064516e-05, + "loss": 0.2261, + "step": 18797 + }, + { + "epoch": 0.300768, + "grad_norm": 0.81640625, + "learning_rate": 7.048870967741936e-05, + "loss": 0.1538, + "step": 18798 + }, + { + "epoch": 0.300784, + "grad_norm": 0.59765625, + "learning_rate": 7.048709677419355e-05, + "loss": 0.1632, + "step": 18799 + }, + { + "epoch": 0.3008, + "grad_norm": 0.69921875, + "learning_rate": 7.048548387096775e-05, + "loss": 0.1705, + "step": 18800 + }, + { + "epoch": 0.300816, + "grad_norm": 0.703125, + "learning_rate": 7.048387096774193e-05, + "loss": 0.1546, + "step": 18801 + }, + { + "epoch": 0.300832, + "grad_norm": 0.7109375, + "learning_rate": 7.048225806451613e-05, + "loss": 0.1741, + "step": 18802 + }, + { + "epoch": 0.300848, + "grad_norm": 0.890625, + "learning_rate": 7.048064516129032e-05, + "loss": 0.175, + "step": 18803 + }, + { + "epoch": 0.300864, + "grad_norm": 1.28125, + "learning_rate": 7.047903225806452e-05, + "loss": 0.1714, + "step": 18804 + }, + { + "epoch": 0.30088, + "grad_norm": 1.328125, + "learning_rate": 7.047741935483872e-05, + "loss": 0.1616, + "step": 18805 + }, + { + "epoch": 0.300896, + "grad_norm": 0.859375, + "learning_rate": 7.04758064516129e-05, + "loss": 0.211, + "step": 18806 + }, + { + "epoch": 0.300912, + "grad_norm": 0.9453125, + "learning_rate": 7.04741935483871e-05, + "loss": 0.1581, + "step": 18807 + }, + { + "epoch": 0.300928, + "grad_norm": 1.4609375, + "learning_rate": 7.047258064516129e-05, + "loss": 0.2218, + "step": 18808 + }, + { + "epoch": 0.300944, + "grad_norm": 0.7890625, + "learning_rate": 7.047096774193549e-05, + "loss": 0.1579, + "step": 18809 + }, + { + "epoch": 0.30096, + "grad_norm": 0.671875, + "learning_rate": 7.046935483870968e-05, + "loss": 0.1214, + "step": 18810 + }, + { + "epoch": 0.300976, + "grad_norm": 1.0234375, + "learning_rate": 7.046774193548388e-05, + "loss": 0.1928, + "step": 18811 + }, + { + "epoch": 0.300992, + "grad_norm": 0.73828125, + "learning_rate": 7.046612903225806e-05, + "loss": 0.2104, + "step": 18812 + }, + { + "epoch": 0.301008, + "grad_norm": 1.4375, + "learning_rate": 7.046451612903226e-05, + "loss": 0.1479, + "step": 18813 + }, + { + "epoch": 0.301024, + "grad_norm": 0.75390625, + "learning_rate": 7.046290322580645e-05, + "loss": 0.1374, + "step": 18814 + }, + { + "epoch": 0.30104, + "grad_norm": 0.77734375, + "learning_rate": 7.046129032258065e-05, + "loss": 0.1484, + "step": 18815 + }, + { + "epoch": 0.301056, + "grad_norm": 0.65234375, + "learning_rate": 7.045967741935485e-05, + "loss": 0.1748, + "step": 18816 + }, + { + "epoch": 0.301072, + "grad_norm": 0.8203125, + "learning_rate": 7.045806451612905e-05, + "loss": 0.1449, + "step": 18817 + }, + { + "epoch": 0.301088, + "grad_norm": 0.765625, + "learning_rate": 7.045645161290323e-05, + "loss": 0.1736, + "step": 18818 + }, + { + "epoch": 0.301104, + "grad_norm": 0.87890625, + "learning_rate": 7.045483870967742e-05, + "loss": 0.2187, + "step": 18819 + }, + { + "epoch": 0.30112, + "grad_norm": 0.78515625, + "learning_rate": 7.045322580645162e-05, + "loss": 0.1924, + "step": 18820 + }, + { + "epoch": 0.301136, + "grad_norm": 1.1796875, + "learning_rate": 7.04516129032258e-05, + "loss": 0.1768, + "step": 18821 + }, + { + "epoch": 0.301152, + "grad_norm": 1.1328125, + "learning_rate": 7.045e-05, + "loss": 0.1681, + "step": 18822 + }, + { + "epoch": 0.301168, + "grad_norm": 0.98046875, + "learning_rate": 7.044838709677419e-05, + "loss": 0.1811, + "step": 18823 + }, + { + "epoch": 0.301184, + "grad_norm": 0.83984375, + "learning_rate": 7.044677419354839e-05, + "loss": 0.1787, + "step": 18824 + }, + { + "epoch": 0.3012, + "grad_norm": 1.203125, + "learning_rate": 7.044516129032258e-05, + "loss": 0.2112, + "step": 18825 + }, + { + "epoch": 0.301216, + "grad_norm": 1.0, + "learning_rate": 7.044354838709678e-05, + "loss": 0.1715, + "step": 18826 + }, + { + "epoch": 0.301232, + "grad_norm": 0.62890625, + "learning_rate": 7.044193548387096e-05, + "loss": 0.1515, + "step": 18827 + }, + { + "epoch": 0.301248, + "grad_norm": 0.70703125, + "learning_rate": 7.044032258064516e-05, + "loss": 0.1779, + "step": 18828 + }, + { + "epoch": 0.301264, + "grad_norm": 0.95703125, + "learning_rate": 7.043870967741936e-05, + "loss": 0.1634, + "step": 18829 + }, + { + "epoch": 0.30128, + "grad_norm": 0.80078125, + "learning_rate": 7.043709677419356e-05, + "loss": 0.1741, + "step": 18830 + }, + { + "epoch": 0.301296, + "grad_norm": 0.9453125, + "learning_rate": 7.043548387096775e-05, + "loss": 0.1878, + "step": 18831 + }, + { + "epoch": 0.301312, + "grad_norm": 0.87109375, + "learning_rate": 7.043387096774195e-05, + "loss": 0.1581, + "step": 18832 + }, + { + "epoch": 0.301328, + "grad_norm": 0.7890625, + "learning_rate": 7.043225806451613e-05, + "loss": 0.1964, + "step": 18833 + }, + { + "epoch": 0.301344, + "grad_norm": 0.65234375, + "learning_rate": 7.043064516129032e-05, + "loss": 0.1613, + "step": 18834 + }, + { + "epoch": 0.30136, + "grad_norm": 0.8359375, + "learning_rate": 7.042903225806452e-05, + "loss": 0.1794, + "step": 18835 + }, + { + "epoch": 0.301376, + "grad_norm": 0.90234375, + "learning_rate": 7.04274193548387e-05, + "loss": 0.1677, + "step": 18836 + }, + { + "epoch": 0.301392, + "grad_norm": 0.70703125, + "learning_rate": 7.04258064516129e-05, + "loss": 0.1411, + "step": 18837 + }, + { + "epoch": 0.301408, + "grad_norm": 0.87109375, + "learning_rate": 7.042419354838709e-05, + "loss": 0.1668, + "step": 18838 + }, + { + "epoch": 0.301424, + "grad_norm": 0.63671875, + "learning_rate": 7.042258064516129e-05, + "loss": 0.1829, + "step": 18839 + }, + { + "epoch": 0.30144, + "grad_norm": 1.203125, + "learning_rate": 7.042096774193549e-05, + "loss": 0.1989, + "step": 18840 + }, + { + "epoch": 0.301456, + "grad_norm": 1.0234375, + "learning_rate": 7.041935483870969e-05, + "loss": 0.1818, + "step": 18841 + }, + { + "epoch": 0.301472, + "grad_norm": 0.98046875, + "learning_rate": 7.041774193548388e-05, + "loss": 0.1754, + "step": 18842 + }, + { + "epoch": 0.301488, + "grad_norm": 0.55859375, + "learning_rate": 7.041612903225808e-05, + "loss": 0.1635, + "step": 18843 + }, + { + "epoch": 0.301504, + "grad_norm": 0.64453125, + "learning_rate": 7.041451612903226e-05, + "loss": 0.1656, + "step": 18844 + }, + { + "epoch": 0.30152, + "grad_norm": 0.66796875, + "learning_rate": 7.041290322580646e-05, + "loss": 0.1331, + "step": 18845 + }, + { + "epoch": 0.301536, + "grad_norm": 0.65625, + "learning_rate": 7.041129032258065e-05, + "loss": 0.179, + "step": 18846 + }, + { + "epoch": 0.301552, + "grad_norm": 1.296875, + "learning_rate": 7.040967741935485e-05, + "loss": 0.1826, + "step": 18847 + }, + { + "epoch": 0.301568, + "grad_norm": 1.578125, + "learning_rate": 7.040806451612903e-05, + "loss": 0.1821, + "step": 18848 + }, + { + "epoch": 0.301584, + "grad_norm": 0.46875, + "learning_rate": 7.040645161290322e-05, + "loss": 0.1597, + "step": 18849 + }, + { + "epoch": 0.3016, + "grad_norm": 0.72265625, + "learning_rate": 7.040483870967742e-05, + "loss": 0.1677, + "step": 18850 + }, + { + "epoch": 0.301616, + "grad_norm": 0.796875, + "learning_rate": 7.040322580645162e-05, + "loss": 0.1948, + "step": 18851 + }, + { + "epoch": 0.301632, + "grad_norm": 0.80859375, + "learning_rate": 7.040161290322582e-05, + "loss": 0.1744, + "step": 18852 + }, + { + "epoch": 0.301648, + "grad_norm": 0.6796875, + "learning_rate": 7.04e-05, + "loss": 0.1703, + "step": 18853 + }, + { + "epoch": 0.301664, + "grad_norm": 1.21875, + "learning_rate": 7.03983870967742e-05, + "loss": 0.1525, + "step": 18854 + }, + { + "epoch": 0.30168, + "grad_norm": 1.125, + "learning_rate": 7.039677419354839e-05, + "loss": 0.1838, + "step": 18855 + }, + { + "epoch": 0.301696, + "grad_norm": 1.296875, + "learning_rate": 7.039516129032259e-05, + "loss": 0.1896, + "step": 18856 + }, + { + "epoch": 0.301712, + "grad_norm": 0.8828125, + "learning_rate": 7.039354838709678e-05, + "loss": 0.1984, + "step": 18857 + }, + { + "epoch": 0.301728, + "grad_norm": 0.8359375, + "learning_rate": 7.039193548387097e-05, + "loss": 0.1589, + "step": 18858 + }, + { + "epoch": 0.301744, + "grad_norm": 0.88671875, + "learning_rate": 7.039032258064516e-05, + "loss": 0.1628, + "step": 18859 + }, + { + "epoch": 0.30176, + "grad_norm": 0.7109375, + "learning_rate": 7.038870967741936e-05, + "loss": 0.1742, + "step": 18860 + }, + { + "epoch": 0.301776, + "grad_norm": 0.87890625, + "learning_rate": 7.038709677419355e-05, + "loss": 0.1627, + "step": 18861 + }, + { + "epoch": 0.301792, + "grad_norm": 0.8671875, + "learning_rate": 7.038548387096775e-05, + "loss": 0.1648, + "step": 18862 + }, + { + "epoch": 0.301808, + "grad_norm": 1.21875, + "learning_rate": 7.038387096774193e-05, + "loss": 0.1824, + "step": 18863 + }, + { + "epoch": 0.301824, + "grad_norm": 0.69140625, + "learning_rate": 7.038225806451613e-05, + "loss": 0.1746, + "step": 18864 + }, + { + "epoch": 0.30184, + "grad_norm": 0.98828125, + "learning_rate": 7.038064516129033e-05, + "loss": 0.1869, + "step": 18865 + }, + { + "epoch": 0.301856, + "grad_norm": 0.84765625, + "learning_rate": 7.037903225806452e-05, + "loss": 0.1658, + "step": 18866 + }, + { + "epoch": 0.301872, + "grad_norm": 0.7109375, + "learning_rate": 7.037741935483872e-05, + "loss": 0.1715, + "step": 18867 + }, + { + "epoch": 0.301888, + "grad_norm": 0.6953125, + "learning_rate": 7.03758064516129e-05, + "loss": 0.1678, + "step": 18868 + }, + { + "epoch": 0.301904, + "grad_norm": 0.90234375, + "learning_rate": 7.03741935483871e-05, + "loss": 0.1675, + "step": 18869 + }, + { + "epoch": 0.30192, + "grad_norm": 0.5078125, + "learning_rate": 7.037258064516129e-05, + "loss": 0.1242, + "step": 18870 + }, + { + "epoch": 0.301936, + "grad_norm": 1.046875, + "learning_rate": 7.037096774193549e-05, + "loss": 0.1548, + "step": 18871 + }, + { + "epoch": 0.301952, + "grad_norm": 1.140625, + "learning_rate": 7.036935483870967e-05, + "loss": 0.191, + "step": 18872 + }, + { + "epoch": 0.301968, + "grad_norm": 0.87109375, + "learning_rate": 7.036774193548387e-05, + "loss": 0.1823, + "step": 18873 + }, + { + "epoch": 0.301984, + "grad_norm": 0.7421875, + "learning_rate": 7.036612903225806e-05, + "loss": 0.1429, + "step": 18874 + }, + { + "epoch": 0.302, + "grad_norm": 0.65625, + "learning_rate": 7.036451612903226e-05, + "loss": 0.1924, + "step": 18875 + }, + { + "epoch": 0.302016, + "grad_norm": 0.98828125, + "learning_rate": 7.036290322580646e-05, + "loss": 0.1557, + "step": 18876 + }, + { + "epoch": 0.302032, + "grad_norm": 0.494140625, + "learning_rate": 7.036129032258066e-05, + "loss": 0.1687, + "step": 18877 + }, + { + "epoch": 0.302048, + "grad_norm": 0.7578125, + "learning_rate": 7.035967741935485e-05, + "loss": 0.1525, + "step": 18878 + }, + { + "epoch": 0.302064, + "grad_norm": 0.80078125, + "learning_rate": 7.035806451612904e-05, + "loss": 0.1553, + "step": 18879 + }, + { + "epoch": 0.30208, + "grad_norm": 0.6875, + "learning_rate": 7.035645161290323e-05, + "loss": 0.2016, + "step": 18880 + }, + { + "epoch": 0.302096, + "grad_norm": 0.8359375, + "learning_rate": 7.035483870967742e-05, + "loss": 0.1388, + "step": 18881 + }, + { + "epoch": 0.302112, + "grad_norm": 0.87890625, + "learning_rate": 7.035322580645162e-05, + "loss": 0.1862, + "step": 18882 + }, + { + "epoch": 0.302128, + "grad_norm": 1.015625, + "learning_rate": 7.03516129032258e-05, + "loss": 0.149, + "step": 18883 + }, + { + "epoch": 0.302144, + "grad_norm": 0.82421875, + "learning_rate": 7.035e-05, + "loss": 0.168, + "step": 18884 + }, + { + "epoch": 0.30216, + "grad_norm": 1.015625, + "learning_rate": 7.034838709677419e-05, + "loss": 0.2043, + "step": 18885 + }, + { + "epoch": 0.302176, + "grad_norm": 0.84375, + "learning_rate": 7.034677419354839e-05, + "loss": 0.144, + "step": 18886 + }, + { + "epoch": 0.302192, + "grad_norm": 0.88671875, + "learning_rate": 7.034516129032259e-05, + "loss": 0.1578, + "step": 18887 + }, + { + "epoch": 0.302208, + "grad_norm": 2.15625, + "learning_rate": 7.034354838709677e-05, + "loss": 0.1674, + "step": 18888 + }, + { + "epoch": 0.302224, + "grad_norm": 0.63671875, + "learning_rate": 7.034193548387097e-05, + "loss": 0.1664, + "step": 18889 + }, + { + "epoch": 0.30224, + "grad_norm": 0.640625, + "learning_rate": 7.034032258064517e-05, + "loss": 0.1389, + "step": 18890 + }, + { + "epoch": 0.302256, + "grad_norm": 0.8046875, + "learning_rate": 7.033870967741936e-05, + "loss": 0.1517, + "step": 18891 + }, + { + "epoch": 0.302272, + "grad_norm": 0.7421875, + "learning_rate": 7.033709677419356e-05, + "loss": 0.1553, + "step": 18892 + }, + { + "epoch": 0.302288, + "grad_norm": 0.74609375, + "learning_rate": 7.033548387096774e-05, + "loss": 0.1505, + "step": 18893 + }, + { + "epoch": 0.302304, + "grad_norm": 1.1484375, + "learning_rate": 7.033387096774194e-05, + "loss": 0.2317, + "step": 18894 + }, + { + "epoch": 0.30232, + "grad_norm": 1.09375, + "learning_rate": 7.033225806451613e-05, + "loss": 0.1959, + "step": 18895 + }, + { + "epoch": 0.302336, + "grad_norm": 0.7265625, + "learning_rate": 7.033064516129032e-05, + "loss": 0.1627, + "step": 18896 + }, + { + "epoch": 0.302352, + "grad_norm": 1.1328125, + "learning_rate": 7.032903225806452e-05, + "loss": 0.1673, + "step": 18897 + }, + { + "epoch": 0.302368, + "grad_norm": 0.8125, + "learning_rate": 7.03274193548387e-05, + "loss": 0.1698, + "step": 18898 + }, + { + "epoch": 0.302384, + "grad_norm": 0.80859375, + "learning_rate": 7.03258064516129e-05, + "loss": 0.1467, + "step": 18899 + }, + { + "epoch": 0.3024, + "grad_norm": 0.65625, + "learning_rate": 7.03241935483871e-05, + "loss": 0.2017, + "step": 18900 + }, + { + "epoch": 0.302416, + "grad_norm": 1.03125, + "learning_rate": 7.03225806451613e-05, + "loss": 0.1766, + "step": 18901 + }, + { + "epoch": 0.302432, + "grad_norm": 0.58203125, + "learning_rate": 7.032096774193549e-05, + "loss": 0.1684, + "step": 18902 + }, + { + "epoch": 0.302448, + "grad_norm": 0.67578125, + "learning_rate": 7.031935483870969e-05, + "loss": 0.1932, + "step": 18903 + }, + { + "epoch": 0.302464, + "grad_norm": 1.0, + "learning_rate": 7.031774193548387e-05, + "loss": 0.1651, + "step": 18904 + }, + { + "epoch": 0.30248, + "grad_norm": 0.6484375, + "learning_rate": 7.031612903225807e-05, + "loss": 0.1744, + "step": 18905 + }, + { + "epoch": 0.302496, + "grad_norm": 0.75, + "learning_rate": 7.031451612903226e-05, + "loss": 0.1638, + "step": 18906 + }, + { + "epoch": 0.302512, + "grad_norm": 0.6796875, + "learning_rate": 7.031290322580646e-05, + "loss": 0.1679, + "step": 18907 + }, + { + "epoch": 0.302528, + "grad_norm": 0.625, + "learning_rate": 7.031129032258064e-05, + "loss": 0.1765, + "step": 18908 + }, + { + "epoch": 0.302544, + "grad_norm": 0.80859375, + "learning_rate": 7.030967741935484e-05, + "loss": 0.2087, + "step": 18909 + }, + { + "epoch": 0.30256, + "grad_norm": 1.1875, + "learning_rate": 7.030806451612903e-05, + "loss": 0.1856, + "step": 18910 + }, + { + "epoch": 0.302576, + "grad_norm": 0.50390625, + "learning_rate": 7.030645161290323e-05, + "loss": 0.1409, + "step": 18911 + }, + { + "epoch": 0.302592, + "grad_norm": 0.7265625, + "learning_rate": 7.030483870967743e-05, + "loss": 0.1793, + "step": 18912 + }, + { + "epoch": 0.302608, + "grad_norm": 1.0703125, + "learning_rate": 7.030322580645162e-05, + "loss": 0.175, + "step": 18913 + }, + { + "epoch": 0.302624, + "grad_norm": 0.70703125, + "learning_rate": 7.030161290322582e-05, + "loss": 0.1733, + "step": 18914 + }, + { + "epoch": 0.30264, + "grad_norm": 0.84375, + "learning_rate": 7.03e-05, + "loss": 0.182, + "step": 18915 + }, + { + "epoch": 0.302656, + "grad_norm": 0.67578125, + "learning_rate": 7.02983870967742e-05, + "loss": 0.1945, + "step": 18916 + }, + { + "epoch": 0.302672, + "grad_norm": 0.71484375, + "learning_rate": 7.029677419354839e-05, + "loss": 0.1829, + "step": 18917 + }, + { + "epoch": 0.302688, + "grad_norm": 0.91796875, + "learning_rate": 7.029516129032259e-05, + "loss": 0.1677, + "step": 18918 + }, + { + "epoch": 0.302704, + "grad_norm": 1.03125, + "learning_rate": 7.029354838709677e-05, + "loss": 0.1713, + "step": 18919 + }, + { + "epoch": 0.30272, + "grad_norm": 0.68359375, + "learning_rate": 7.029193548387097e-05, + "loss": 0.1729, + "step": 18920 + }, + { + "epoch": 0.302736, + "grad_norm": 0.89453125, + "learning_rate": 7.029032258064516e-05, + "loss": 0.2006, + "step": 18921 + }, + { + "epoch": 0.302752, + "grad_norm": 0.5546875, + "learning_rate": 7.028870967741936e-05, + "loss": 0.1474, + "step": 18922 + }, + { + "epoch": 0.302768, + "grad_norm": 0.91015625, + "learning_rate": 7.028709677419354e-05, + "loss": 0.1795, + "step": 18923 + }, + { + "epoch": 0.302784, + "grad_norm": 0.7265625, + "learning_rate": 7.028548387096774e-05, + "loss": 0.1511, + "step": 18924 + }, + { + "epoch": 0.3028, + "grad_norm": 0.81640625, + "learning_rate": 7.028387096774194e-05, + "loss": 0.2078, + "step": 18925 + }, + { + "epoch": 0.302816, + "grad_norm": 0.60546875, + "learning_rate": 7.028225806451614e-05, + "loss": 0.1619, + "step": 18926 + }, + { + "epoch": 0.302832, + "grad_norm": 0.734375, + "learning_rate": 7.028064516129033e-05, + "loss": 0.1642, + "step": 18927 + }, + { + "epoch": 0.302848, + "grad_norm": 0.859375, + "learning_rate": 7.027903225806452e-05, + "loss": 0.1786, + "step": 18928 + }, + { + "epoch": 0.302864, + "grad_norm": 0.69921875, + "learning_rate": 7.027741935483871e-05, + "loss": 0.1589, + "step": 18929 + }, + { + "epoch": 0.30288, + "grad_norm": 0.64453125, + "learning_rate": 7.02758064516129e-05, + "loss": 0.1634, + "step": 18930 + }, + { + "epoch": 0.302896, + "grad_norm": 0.58984375, + "learning_rate": 7.02741935483871e-05, + "loss": 0.1924, + "step": 18931 + }, + { + "epoch": 0.302912, + "grad_norm": 1.09375, + "learning_rate": 7.027258064516129e-05, + "loss": 0.1664, + "step": 18932 + }, + { + "epoch": 0.302928, + "grad_norm": 0.6953125, + "learning_rate": 7.027096774193549e-05, + "loss": 0.1966, + "step": 18933 + }, + { + "epoch": 0.302944, + "grad_norm": 0.53125, + "learning_rate": 7.026935483870967e-05, + "loss": 0.1341, + "step": 18934 + }, + { + "epoch": 0.30296, + "grad_norm": 0.640625, + "learning_rate": 7.026774193548387e-05, + "loss": 0.1697, + "step": 18935 + }, + { + "epoch": 0.302976, + "grad_norm": 0.58984375, + "learning_rate": 7.026612903225807e-05, + "loss": 0.1523, + "step": 18936 + }, + { + "epoch": 0.302992, + "grad_norm": 0.83203125, + "learning_rate": 7.026451612903227e-05, + "loss": 0.1786, + "step": 18937 + }, + { + "epoch": 0.303008, + "grad_norm": 1.0234375, + "learning_rate": 7.026290322580646e-05, + "loss": 0.192, + "step": 18938 + }, + { + "epoch": 0.303024, + "grad_norm": 0.6875, + "learning_rate": 7.026129032258066e-05, + "loss": 0.1465, + "step": 18939 + }, + { + "epoch": 0.30304, + "grad_norm": 0.80078125, + "learning_rate": 7.025967741935484e-05, + "loss": 0.1554, + "step": 18940 + }, + { + "epoch": 0.303056, + "grad_norm": 0.57421875, + "learning_rate": 7.025806451612904e-05, + "loss": 0.1869, + "step": 18941 + }, + { + "epoch": 0.303072, + "grad_norm": 0.69921875, + "learning_rate": 7.025645161290323e-05, + "loss": 0.1595, + "step": 18942 + }, + { + "epoch": 0.303088, + "grad_norm": 0.765625, + "learning_rate": 7.025483870967741e-05, + "loss": 0.1817, + "step": 18943 + }, + { + "epoch": 0.303104, + "grad_norm": 0.7890625, + "learning_rate": 7.025322580645161e-05, + "loss": 0.2022, + "step": 18944 + }, + { + "epoch": 0.30312, + "grad_norm": 0.609375, + "learning_rate": 7.02516129032258e-05, + "loss": 0.1746, + "step": 18945 + }, + { + "epoch": 0.303136, + "grad_norm": 0.84765625, + "learning_rate": 7.025e-05, + "loss": 0.1333, + "step": 18946 + }, + { + "epoch": 0.303152, + "grad_norm": 0.6875, + "learning_rate": 7.02483870967742e-05, + "loss": 0.1568, + "step": 18947 + }, + { + "epoch": 0.303168, + "grad_norm": 0.7265625, + "learning_rate": 7.02467741935484e-05, + "loss": 0.144, + "step": 18948 + }, + { + "epoch": 0.303184, + "grad_norm": 0.80078125, + "learning_rate": 7.024516129032259e-05, + "loss": 0.1853, + "step": 18949 + }, + { + "epoch": 0.3032, + "grad_norm": 0.90625, + "learning_rate": 7.024354838709678e-05, + "loss": 0.1484, + "step": 18950 + }, + { + "epoch": 0.303216, + "grad_norm": 0.78125, + "learning_rate": 7.024193548387097e-05, + "loss": 0.1425, + "step": 18951 + }, + { + "epoch": 0.303232, + "grad_norm": 0.859375, + "learning_rate": 7.024032258064517e-05, + "loss": 0.164, + "step": 18952 + }, + { + "epoch": 0.303248, + "grad_norm": 0.7890625, + "learning_rate": 7.023870967741936e-05, + "loss": 0.1611, + "step": 18953 + }, + { + "epoch": 0.303264, + "grad_norm": 0.9296875, + "learning_rate": 7.023709677419356e-05, + "loss": 0.1649, + "step": 18954 + }, + { + "epoch": 0.30328, + "grad_norm": 1.1796875, + "learning_rate": 7.023548387096774e-05, + "loss": 0.1797, + "step": 18955 + }, + { + "epoch": 0.303296, + "grad_norm": 0.6484375, + "learning_rate": 7.023387096774194e-05, + "loss": 0.1683, + "step": 18956 + }, + { + "epoch": 0.303312, + "grad_norm": 0.62109375, + "learning_rate": 7.023225806451613e-05, + "loss": 0.1235, + "step": 18957 + }, + { + "epoch": 0.303328, + "grad_norm": 0.69140625, + "learning_rate": 7.023064516129031e-05, + "loss": 0.1367, + "step": 18958 + }, + { + "epoch": 0.303344, + "grad_norm": 0.765625, + "learning_rate": 7.022903225806451e-05, + "loss": 0.1965, + "step": 18959 + }, + { + "epoch": 0.30336, + "grad_norm": 0.8828125, + "learning_rate": 7.022741935483871e-05, + "loss": 0.1731, + "step": 18960 + }, + { + "epoch": 0.303376, + "grad_norm": 0.71484375, + "learning_rate": 7.022580645161291e-05, + "loss": 0.1738, + "step": 18961 + }, + { + "epoch": 0.303392, + "grad_norm": 1.21875, + "learning_rate": 7.02241935483871e-05, + "loss": 0.2104, + "step": 18962 + }, + { + "epoch": 0.303408, + "grad_norm": 0.8671875, + "learning_rate": 7.02225806451613e-05, + "loss": 0.193, + "step": 18963 + }, + { + "epoch": 0.303424, + "grad_norm": 0.9140625, + "learning_rate": 7.022096774193548e-05, + "loss": 0.2026, + "step": 18964 + }, + { + "epoch": 0.30344, + "grad_norm": 1.1953125, + "learning_rate": 7.021935483870968e-05, + "loss": 0.1935, + "step": 18965 + }, + { + "epoch": 0.303456, + "grad_norm": 0.8671875, + "learning_rate": 7.021774193548387e-05, + "loss": 0.1941, + "step": 18966 + }, + { + "epoch": 0.303472, + "grad_norm": 0.74609375, + "learning_rate": 7.021612903225807e-05, + "loss": 0.1647, + "step": 18967 + }, + { + "epoch": 0.303488, + "grad_norm": 0.69140625, + "learning_rate": 7.021451612903226e-05, + "loss": 0.1158, + "step": 18968 + }, + { + "epoch": 0.303504, + "grad_norm": 1.171875, + "learning_rate": 7.021290322580646e-05, + "loss": 0.1568, + "step": 18969 + }, + { + "epoch": 0.30352, + "grad_norm": 0.546875, + "learning_rate": 7.021129032258064e-05, + "loss": 0.1386, + "step": 18970 + }, + { + "epoch": 0.303536, + "grad_norm": 0.67578125, + "learning_rate": 7.020967741935484e-05, + "loss": 0.1847, + "step": 18971 + }, + { + "epoch": 0.303552, + "grad_norm": 1.15625, + "learning_rate": 7.020806451612904e-05, + "loss": 0.1425, + "step": 18972 + }, + { + "epoch": 0.303568, + "grad_norm": 0.71484375, + "learning_rate": 7.020645161290324e-05, + "loss": 0.1404, + "step": 18973 + }, + { + "epoch": 0.303584, + "grad_norm": 0.8671875, + "learning_rate": 7.020483870967743e-05, + "loss": 0.1852, + "step": 18974 + }, + { + "epoch": 0.3036, + "grad_norm": 0.609375, + "learning_rate": 7.020322580645161e-05, + "loss": 0.1633, + "step": 18975 + }, + { + "epoch": 0.303616, + "grad_norm": 0.87109375, + "learning_rate": 7.020161290322581e-05, + "loss": 0.1741, + "step": 18976 + }, + { + "epoch": 0.303632, + "grad_norm": 0.55078125, + "learning_rate": 7.02e-05, + "loss": 0.1588, + "step": 18977 + }, + { + "epoch": 0.303648, + "grad_norm": 1.0703125, + "learning_rate": 7.01983870967742e-05, + "loss": 0.1805, + "step": 18978 + }, + { + "epoch": 0.303664, + "grad_norm": 0.6953125, + "learning_rate": 7.019677419354838e-05, + "loss": 0.1696, + "step": 18979 + }, + { + "epoch": 0.30368, + "grad_norm": 0.9453125, + "learning_rate": 7.019516129032258e-05, + "loss": 0.1768, + "step": 18980 + }, + { + "epoch": 0.303696, + "grad_norm": 0.92578125, + "learning_rate": 7.019354838709677e-05, + "loss": 0.1304, + "step": 18981 + }, + { + "epoch": 0.303712, + "grad_norm": 1.2578125, + "learning_rate": 7.019193548387097e-05, + "loss": 0.1447, + "step": 18982 + }, + { + "epoch": 0.303728, + "grad_norm": 0.81640625, + "learning_rate": 7.019032258064516e-05, + "loss": 0.1552, + "step": 18983 + }, + { + "epoch": 0.303744, + "grad_norm": 0.7578125, + "learning_rate": 7.018870967741936e-05, + "loss": 0.181, + "step": 18984 + }, + { + "epoch": 0.30376, + "grad_norm": 0.80859375, + "learning_rate": 7.018709677419356e-05, + "loss": 0.1534, + "step": 18985 + }, + { + "epoch": 0.303776, + "grad_norm": 0.66796875, + "learning_rate": 7.018548387096775e-05, + "loss": 0.1581, + "step": 18986 + }, + { + "epoch": 0.303792, + "grad_norm": 0.7109375, + "learning_rate": 7.018387096774194e-05, + "loss": 0.1649, + "step": 18987 + }, + { + "epoch": 0.303808, + "grad_norm": 0.671875, + "learning_rate": 7.018225806451614e-05, + "loss": 0.1206, + "step": 18988 + }, + { + "epoch": 0.303824, + "grad_norm": 0.76171875, + "learning_rate": 7.018064516129033e-05, + "loss": 0.1927, + "step": 18989 + }, + { + "epoch": 0.30384, + "grad_norm": 0.7578125, + "learning_rate": 7.017903225806451e-05, + "loss": 0.1571, + "step": 18990 + }, + { + "epoch": 0.303856, + "grad_norm": 0.7578125, + "learning_rate": 7.017741935483871e-05, + "loss": 0.1403, + "step": 18991 + }, + { + "epoch": 0.303872, + "grad_norm": 0.67578125, + "learning_rate": 7.01758064516129e-05, + "loss": 0.1663, + "step": 18992 + }, + { + "epoch": 0.303888, + "grad_norm": 1.1875, + "learning_rate": 7.01741935483871e-05, + "loss": 0.2016, + "step": 18993 + }, + { + "epoch": 0.303904, + "grad_norm": 0.94921875, + "learning_rate": 7.017258064516128e-05, + "loss": 0.2085, + "step": 18994 + }, + { + "epoch": 0.30392, + "grad_norm": 0.796875, + "learning_rate": 7.017096774193548e-05, + "loss": 0.1562, + "step": 18995 + }, + { + "epoch": 0.303936, + "grad_norm": 0.84765625, + "learning_rate": 7.016935483870968e-05, + "loss": 0.1443, + "step": 18996 + }, + { + "epoch": 0.303952, + "grad_norm": 0.7265625, + "learning_rate": 7.016774193548388e-05, + "loss": 0.1784, + "step": 18997 + }, + { + "epoch": 0.303968, + "grad_norm": 0.86328125, + "learning_rate": 7.016612903225807e-05, + "loss": 0.1711, + "step": 18998 + }, + { + "epoch": 0.303984, + "grad_norm": 1.0234375, + "learning_rate": 7.016451612903227e-05, + "loss": 0.1747, + "step": 18999 + }, + { + "epoch": 0.304, + "grad_norm": 1.078125, + "learning_rate": 7.016290322580645e-05, + "loss": 0.1757, + "step": 19000 + }, + { + "epoch": 0.304016, + "grad_norm": 1.671875, + "learning_rate": 7.016129032258065e-05, + "loss": 0.1716, + "step": 19001 + }, + { + "epoch": 0.304032, + "grad_norm": 1.0234375, + "learning_rate": 7.015967741935484e-05, + "loss": 0.1543, + "step": 19002 + }, + { + "epoch": 0.304048, + "grad_norm": 1.125, + "learning_rate": 7.015806451612904e-05, + "loss": 0.1727, + "step": 19003 + }, + { + "epoch": 0.304064, + "grad_norm": 0.71484375, + "learning_rate": 7.015645161290323e-05, + "loss": 0.1482, + "step": 19004 + }, + { + "epoch": 0.30408, + "grad_norm": 1.0, + "learning_rate": 7.015483870967741e-05, + "loss": 0.1886, + "step": 19005 + }, + { + "epoch": 0.304096, + "grad_norm": 0.96484375, + "learning_rate": 7.015322580645161e-05, + "loss": 0.1774, + "step": 19006 + }, + { + "epoch": 0.304112, + "grad_norm": 0.9453125, + "learning_rate": 7.015161290322581e-05, + "loss": 0.1996, + "step": 19007 + }, + { + "epoch": 0.304128, + "grad_norm": 0.82421875, + "learning_rate": 7.015000000000001e-05, + "loss": 0.1897, + "step": 19008 + }, + { + "epoch": 0.304144, + "grad_norm": 0.703125, + "learning_rate": 7.01483870967742e-05, + "loss": 0.1718, + "step": 19009 + }, + { + "epoch": 0.30416, + "grad_norm": 1.0546875, + "learning_rate": 7.01467741935484e-05, + "loss": 0.1598, + "step": 19010 + }, + { + "epoch": 0.304176, + "grad_norm": 0.515625, + "learning_rate": 7.014516129032258e-05, + "loss": 0.1414, + "step": 19011 + }, + { + "epoch": 0.304192, + "grad_norm": 0.85546875, + "learning_rate": 7.014354838709678e-05, + "loss": 0.1527, + "step": 19012 + }, + { + "epoch": 0.304208, + "grad_norm": 0.765625, + "learning_rate": 7.014193548387097e-05, + "loss": 0.2048, + "step": 19013 + }, + { + "epoch": 0.304224, + "grad_norm": 0.77734375, + "learning_rate": 7.014032258064517e-05, + "loss": 0.163, + "step": 19014 + }, + { + "epoch": 0.30424, + "grad_norm": 0.609375, + "learning_rate": 7.013870967741935e-05, + "loss": 0.1462, + "step": 19015 + }, + { + "epoch": 0.304256, + "grad_norm": 0.734375, + "learning_rate": 7.013709677419355e-05, + "loss": 0.1614, + "step": 19016 + }, + { + "epoch": 0.304272, + "grad_norm": 0.515625, + "learning_rate": 7.013548387096774e-05, + "loss": 0.146, + "step": 19017 + }, + { + "epoch": 0.304288, + "grad_norm": 0.6484375, + "learning_rate": 7.013387096774194e-05, + "loss": 0.1778, + "step": 19018 + }, + { + "epoch": 0.304304, + "grad_norm": 0.80859375, + "learning_rate": 7.013225806451613e-05, + "loss": 0.1644, + "step": 19019 + }, + { + "epoch": 0.30432, + "grad_norm": 1.296875, + "learning_rate": 7.013064516129033e-05, + "loss": 0.1743, + "step": 19020 + }, + { + "epoch": 0.304336, + "grad_norm": 0.87890625, + "learning_rate": 7.012903225806452e-05, + "loss": 0.1638, + "step": 19021 + }, + { + "epoch": 0.304352, + "grad_norm": 0.859375, + "learning_rate": 7.012741935483871e-05, + "loss": 0.1689, + "step": 19022 + }, + { + "epoch": 0.304368, + "grad_norm": 0.921875, + "learning_rate": 7.012580645161291e-05, + "loss": 0.2028, + "step": 19023 + }, + { + "epoch": 0.304384, + "grad_norm": 0.54296875, + "learning_rate": 7.01241935483871e-05, + "loss": 0.1659, + "step": 19024 + }, + { + "epoch": 0.3044, + "grad_norm": 0.78125, + "learning_rate": 7.01225806451613e-05, + "loss": 0.1885, + "step": 19025 + }, + { + "epoch": 0.304416, + "grad_norm": 0.6328125, + "learning_rate": 7.012096774193548e-05, + "loss": 0.1436, + "step": 19026 + }, + { + "epoch": 0.304432, + "grad_norm": 0.92578125, + "learning_rate": 7.011935483870968e-05, + "loss": 0.1434, + "step": 19027 + }, + { + "epoch": 0.304448, + "grad_norm": 0.71484375, + "learning_rate": 7.011774193548387e-05, + "loss": 0.1696, + "step": 19028 + }, + { + "epoch": 0.304464, + "grad_norm": 0.80078125, + "learning_rate": 7.011612903225807e-05, + "loss": 0.1885, + "step": 19029 + }, + { + "epoch": 0.30448, + "grad_norm": 0.85546875, + "learning_rate": 7.011451612903225e-05, + "loss": 0.1817, + "step": 19030 + }, + { + "epoch": 0.304496, + "grad_norm": 0.984375, + "learning_rate": 7.011290322580645e-05, + "loss": 0.1391, + "step": 19031 + }, + { + "epoch": 0.304512, + "grad_norm": 0.72265625, + "learning_rate": 7.011129032258065e-05, + "loss": 0.2032, + "step": 19032 + }, + { + "epoch": 0.304528, + "grad_norm": 0.7421875, + "learning_rate": 7.010967741935485e-05, + "loss": 0.137, + "step": 19033 + }, + { + "epoch": 0.304544, + "grad_norm": 1.0234375, + "learning_rate": 7.010806451612904e-05, + "loss": 0.1861, + "step": 19034 + }, + { + "epoch": 0.30456, + "grad_norm": 0.76953125, + "learning_rate": 7.010645161290324e-05, + "loss": 0.2026, + "step": 19035 + }, + { + "epoch": 0.304576, + "grad_norm": 0.5859375, + "learning_rate": 7.010483870967742e-05, + "loss": 0.1559, + "step": 19036 + }, + { + "epoch": 0.304592, + "grad_norm": 0.828125, + "learning_rate": 7.010322580645161e-05, + "loss": 0.1815, + "step": 19037 + }, + { + "epoch": 0.304608, + "grad_norm": 0.5390625, + "learning_rate": 7.010161290322581e-05, + "loss": 0.1469, + "step": 19038 + }, + { + "epoch": 0.304624, + "grad_norm": 0.75, + "learning_rate": 7.01e-05, + "loss": 0.1466, + "step": 19039 + }, + { + "epoch": 0.30464, + "grad_norm": 0.609375, + "learning_rate": 7.00983870967742e-05, + "loss": 0.1335, + "step": 19040 + }, + { + "epoch": 0.304656, + "grad_norm": 1.3671875, + "learning_rate": 7.009677419354838e-05, + "loss": 0.1539, + "step": 19041 + }, + { + "epoch": 0.304672, + "grad_norm": 0.82421875, + "learning_rate": 7.009516129032258e-05, + "loss": 0.1637, + "step": 19042 + }, + { + "epoch": 0.304688, + "grad_norm": 0.95703125, + "learning_rate": 7.009354838709678e-05, + "loss": 0.1668, + "step": 19043 + }, + { + "epoch": 0.304704, + "grad_norm": 0.65234375, + "learning_rate": 7.009193548387097e-05, + "loss": 0.1574, + "step": 19044 + }, + { + "epoch": 0.30472, + "grad_norm": 0.69140625, + "learning_rate": 7.009032258064517e-05, + "loss": 0.1784, + "step": 19045 + }, + { + "epoch": 0.304736, + "grad_norm": 0.80078125, + "learning_rate": 7.008870967741937e-05, + "loss": 0.1644, + "step": 19046 + }, + { + "epoch": 0.304752, + "grad_norm": 0.90234375, + "learning_rate": 7.008709677419355e-05, + "loss": 0.1682, + "step": 19047 + }, + { + "epoch": 0.304768, + "grad_norm": 0.73046875, + "learning_rate": 7.008548387096775e-05, + "loss": 0.1338, + "step": 19048 + }, + { + "epoch": 0.304784, + "grad_norm": 1.3515625, + "learning_rate": 7.008387096774194e-05, + "loss": 0.1947, + "step": 19049 + }, + { + "epoch": 0.3048, + "grad_norm": 0.6484375, + "learning_rate": 7.008225806451614e-05, + "loss": 0.1732, + "step": 19050 + }, + { + "epoch": 0.304816, + "grad_norm": 0.78515625, + "learning_rate": 7.008064516129032e-05, + "loss": 0.2107, + "step": 19051 + }, + { + "epoch": 0.304832, + "grad_norm": 1.2734375, + "learning_rate": 7.007903225806451e-05, + "loss": 0.2051, + "step": 19052 + }, + { + "epoch": 0.304848, + "grad_norm": 1.0, + "learning_rate": 7.007741935483871e-05, + "loss": 0.1927, + "step": 19053 + }, + { + "epoch": 0.304864, + "grad_norm": 0.921875, + "learning_rate": 7.00758064516129e-05, + "loss": 0.175, + "step": 19054 + }, + { + "epoch": 0.30488, + "grad_norm": 0.81640625, + "learning_rate": 7.00741935483871e-05, + "loss": 0.1885, + "step": 19055 + }, + { + "epoch": 0.304896, + "grad_norm": 1.046875, + "learning_rate": 7.00725806451613e-05, + "loss": 0.1668, + "step": 19056 + }, + { + "epoch": 0.304912, + "grad_norm": 0.8203125, + "learning_rate": 7.00709677419355e-05, + "loss": 0.182, + "step": 19057 + }, + { + "epoch": 0.304928, + "grad_norm": 0.62890625, + "learning_rate": 7.006935483870968e-05, + "loss": 0.1754, + "step": 19058 + }, + { + "epoch": 0.304944, + "grad_norm": 0.63671875, + "learning_rate": 7.006774193548388e-05, + "loss": 0.135, + "step": 19059 + }, + { + "epoch": 0.30496, + "grad_norm": 1.2265625, + "learning_rate": 7.006612903225807e-05, + "loss": 0.1669, + "step": 19060 + }, + { + "epoch": 0.304976, + "grad_norm": 1.453125, + "learning_rate": 7.006451612903227e-05, + "loss": 0.1778, + "step": 19061 + }, + { + "epoch": 0.304992, + "grad_norm": 1.234375, + "learning_rate": 7.006290322580645e-05, + "loss": 0.1503, + "step": 19062 + }, + { + "epoch": 0.305008, + "grad_norm": 1.1015625, + "learning_rate": 7.006129032258065e-05, + "loss": 0.2021, + "step": 19063 + }, + { + "epoch": 0.305024, + "grad_norm": 0.9453125, + "learning_rate": 7.005967741935484e-05, + "loss": 0.1795, + "step": 19064 + }, + { + "epoch": 0.30504, + "grad_norm": 0.8359375, + "learning_rate": 7.005806451612904e-05, + "loss": 0.154, + "step": 19065 + }, + { + "epoch": 0.305056, + "grad_norm": 1.28125, + "learning_rate": 7.005645161290322e-05, + "loss": 0.1527, + "step": 19066 + }, + { + "epoch": 0.305072, + "grad_norm": 0.68359375, + "learning_rate": 7.005483870967742e-05, + "loss": 0.1882, + "step": 19067 + }, + { + "epoch": 0.305088, + "grad_norm": 0.7734375, + "learning_rate": 7.005322580645162e-05, + "loss": 0.1333, + "step": 19068 + }, + { + "epoch": 0.305104, + "grad_norm": 0.78515625, + "learning_rate": 7.005161290322581e-05, + "loss": 0.1161, + "step": 19069 + }, + { + "epoch": 0.30512, + "grad_norm": 0.5546875, + "learning_rate": 7.005000000000001e-05, + "loss": 0.2015, + "step": 19070 + }, + { + "epoch": 0.305136, + "grad_norm": 0.89453125, + "learning_rate": 7.00483870967742e-05, + "loss": 0.1523, + "step": 19071 + }, + { + "epoch": 0.305152, + "grad_norm": 0.5546875, + "learning_rate": 7.00467741935484e-05, + "loss": 0.1461, + "step": 19072 + }, + { + "epoch": 0.305168, + "grad_norm": 0.8359375, + "learning_rate": 7.004516129032258e-05, + "loss": 0.1608, + "step": 19073 + }, + { + "epoch": 0.305184, + "grad_norm": 0.84765625, + "learning_rate": 7.004354838709678e-05, + "loss": 0.1896, + "step": 19074 + }, + { + "epoch": 0.3052, + "grad_norm": 0.703125, + "learning_rate": 7.004193548387097e-05, + "loss": 0.1608, + "step": 19075 + }, + { + "epoch": 0.305216, + "grad_norm": 1.1328125, + "learning_rate": 7.004032258064517e-05, + "loss": 0.1727, + "step": 19076 + }, + { + "epoch": 0.305232, + "grad_norm": 0.6328125, + "learning_rate": 7.003870967741935e-05, + "loss": 0.1567, + "step": 19077 + }, + { + "epoch": 0.305248, + "grad_norm": 0.7578125, + "learning_rate": 7.003709677419355e-05, + "loss": 0.1613, + "step": 19078 + }, + { + "epoch": 0.305264, + "grad_norm": 1.125, + "learning_rate": 7.003548387096774e-05, + "loss": 0.1534, + "step": 19079 + }, + { + "epoch": 0.30528, + "grad_norm": 0.5625, + "learning_rate": 7.003387096774194e-05, + "loss": 0.177, + "step": 19080 + }, + { + "epoch": 0.305296, + "grad_norm": 0.8046875, + "learning_rate": 7.003225806451614e-05, + "loss": 0.1489, + "step": 19081 + }, + { + "epoch": 0.305312, + "grad_norm": 0.6796875, + "learning_rate": 7.003064516129034e-05, + "loss": 0.1608, + "step": 19082 + }, + { + "epoch": 0.305328, + "grad_norm": 1.6796875, + "learning_rate": 7.002903225806452e-05, + "loss": 0.1767, + "step": 19083 + }, + { + "epoch": 0.305344, + "grad_norm": 0.77734375, + "learning_rate": 7.002741935483871e-05, + "loss": 0.1938, + "step": 19084 + }, + { + "epoch": 0.30536, + "grad_norm": 0.6640625, + "learning_rate": 7.002580645161291e-05, + "loss": 0.2002, + "step": 19085 + }, + { + "epoch": 0.305376, + "grad_norm": 0.76171875, + "learning_rate": 7.00241935483871e-05, + "loss": 0.1795, + "step": 19086 + }, + { + "epoch": 0.305392, + "grad_norm": 0.828125, + "learning_rate": 7.00225806451613e-05, + "loss": 0.1343, + "step": 19087 + }, + { + "epoch": 0.305408, + "grad_norm": 0.90234375, + "learning_rate": 7.002096774193548e-05, + "loss": 0.2192, + "step": 19088 + }, + { + "epoch": 0.305424, + "grad_norm": 0.890625, + "learning_rate": 7.001935483870968e-05, + "loss": 0.2201, + "step": 19089 + }, + { + "epoch": 0.30544, + "grad_norm": 1.0625, + "learning_rate": 7.001774193548387e-05, + "loss": 0.2003, + "step": 19090 + }, + { + "epoch": 0.305456, + "grad_norm": 0.7890625, + "learning_rate": 7.001612903225807e-05, + "loss": 0.1911, + "step": 19091 + }, + { + "epoch": 0.305472, + "grad_norm": 0.67578125, + "learning_rate": 7.001451612903226e-05, + "loss": 0.2131, + "step": 19092 + }, + { + "epoch": 0.305488, + "grad_norm": 1.0546875, + "learning_rate": 7.001290322580646e-05, + "loss": 0.1697, + "step": 19093 + }, + { + "epoch": 0.305504, + "grad_norm": 0.68359375, + "learning_rate": 7.001129032258065e-05, + "loss": 0.1644, + "step": 19094 + }, + { + "epoch": 0.30552, + "grad_norm": 0.7578125, + "learning_rate": 7.000967741935485e-05, + "loss": 0.1728, + "step": 19095 + }, + { + "epoch": 0.305536, + "grad_norm": 1.0234375, + "learning_rate": 7.000806451612904e-05, + "loss": 0.1464, + "step": 19096 + }, + { + "epoch": 0.305552, + "grad_norm": 0.99609375, + "learning_rate": 7.000645161290324e-05, + "loss": 0.2457, + "step": 19097 + }, + { + "epoch": 0.305568, + "grad_norm": 0.6953125, + "learning_rate": 7.000483870967742e-05, + "loss": 0.1648, + "step": 19098 + }, + { + "epoch": 0.305584, + "grad_norm": 0.78515625, + "learning_rate": 7.000322580645161e-05, + "loss": 0.1661, + "step": 19099 + }, + { + "epoch": 0.3056, + "grad_norm": 0.73046875, + "learning_rate": 7.000161290322581e-05, + "loss": 0.1694, + "step": 19100 + }, + { + "epoch": 0.305616, + "grad_norm": 1.78125, + "learning_rate": 7e-05, + "loss": 0.164, + "step": 19101 + }, + { + "epoch": 0.305632, + "grad_norm": 0.7109375, + "learning_rate": 6.99983870967742e-05, + "loss": 0.1651, + "step": 19102 + }, + { + "epoch": 0.305648, + "grad_norm": 1.2578125, + "learning_rate": 6.999677419354839e-05, + "loss": 0.2091, + "step": 19103 + }, + { + "epoch": 0.305664, + "grad_norm": 1.3671875, + "learning_rate": 6.999516129032259e-05, + "loss": 0.1324, + "step": 19104 + }, + { + "epoch": 0.30568, + "grad_norm": 0.85546875, + "learning_rate": 6.999354838709678e-05, + "loss": 0.1367, + "step": 19105 + }, + { + "epoch": 0.305696, + "grad_norm": 1.3984375, + "learning_rate": 6.999193548387098e-05, + "loss": 0.1829, + "step": 19106 + }, + { + "epoch": 0.305712, + "grad_norm": 1.125, + "learning_rate": 6.999032258064516e-05, + "loss": 0.2002, + "step": 19107 + }, + { + "epoch": 0.305728, + "grad_norm": 0.73046875, + "learning_rate": 6.998870967741936e-05, + "loss": 0.1779, + "step": 19108 + }, + { + "epoch": 0.305744, + "grad_norm": 1.078125, + "learning_rate": 6.998709677419355e-05, + "loss": 0.1433, + "step": 19109 + }, + { + "epoch": 0.30576, + "grad_norm": 0.7890625, + "learning_rate": 6.998548387096775e-05, + "loss": 0.1597, + "step": 19110 + }, + { + "epoch": 0.305776, + "grad_norm": 0.921875, + "learning_rate": 6.998387096774194e-05, + "loss": 0.1573, + "step": 19111 + }, + { + "epoch": 0.305792, + "grad_norm": 0.6953125, + "learning_rate": 6.998225806451614e-05, + "loss": 0.1974, + "step": 19112 + }, + { + "epoch": 0.305808, + "grad_norm": 0.8046875, + "learning_rate": 6.998064516129032e-05, + "loss": 0.1643, + "step": 19113 + }, + { + "epoch": 0.305824, + "grad_norm": 0.92578125, + "learning_rate": 6.997903225806451e-05, + "loss": 0.1946, + "step": 19114 + }, + { + "epoch": 0.30584, + "grad_norm": 1.203125, + "learning_rate": 6.997741935483871e-05, + "loss": 0.1623, + "step": 19115 + }, + { + "epoch": 0.305856, + "grad_norm": 0.66796875, + "learning_rate": 6.997580645161291e-05, + "loss": 0.2144, + "step": 19116 + }, + { + "epoch": 0.305872, + "grad_norm": 0.80078125, + "learning_rate": 6.99741935483871e-05, + "loss": 0.1831, + "step": 19117 + }, + { + "epoch": 0.305888, + "grad_norm": 0.6796875, + "learning_rate": 6.997258064516129e-05, + "loss": 0.1563, + "step": 19118 + }, + { + "epoch": 0.305904, + "grad_norm": 0.60546875, + "learning_rate": 6.997096774193549e-05, + "loss": 0.1771, + "step": 19119 + }, + { + "epoch": 0.30592, + "grad_norm": 0.578125, + "learning_rate": 6.996935483870968e-05, + "loss": 0.1586, + "step": 19120 + }, + { + "epoch": 0.305936, + "grad_norm": 1.1328125, + "learning_rate": 6.996774193548388e-05, + "loss": 0.1837, + "step": 19121 + }, + { + "epoch": 0.305952, + "grad_norm": 0.73828125, + "learning_rate": 6.996612903225806e-05, + "loss": 0.1676, + "step": 19122 + }, + { + "epoch": 0.305968, + "grad_norm": 0.86328125, + "learning_rate": 6.996451612903226e-05, + "loss": 0.1808, + "step": 19123 + }, + { + "epoch": 0.305984, + "grad_norm": 0.59765625, + "learning_rate": 6.996290322580645e-05, + "loss": 0.1696, + "step": 19124 + }, + { + "epoch": 0.306, + "grad_norm": 1.5078125, + "learning_rate": 6.996129032258065e-05, + "loss": 0.1618, + "step": 19125 + }, + { + "epoch": 0.306016, + "grad_norm": 0.7734375, + "learning_rate": 6.995967741935484e-05, + "loss": 0.1547, + "step": 19126 + }, + { + "epoch": 0.306032, + "grad_norm": 0.828125, + "learning_rate": 6.995806451612904e-05, + "loss": 0.1634, + "step": 19127 + }, + { + "epoch": 0.306048, + "grad_norm": 0.8359375, + "learning_rate": 6.995645161290323e-05, + "loss": 0.2075, + "step": 19128 + }, + { + "epoch": 0.306064, + "grad_norm": 1.3203125, + "learning_rate": 6.995483870967743e-05, + "loss": 0.2018, + "step": 19129 + }, + { + "epoch": 0.30608, + "grad_norm": 1.1953125, + "learning_rate": 6.995322580645162e-05, + "loss": 0.1764, + "step": 19130 + }, + { + "epoch": 0.306096, + "grad_norm": 0.60546875, + "learning_rate": 6.99516129032258e-05, + "loss": 0.1864, + "step": 19131 + }, + { + "epoch": 0.306112, + "grad_norm": 0.8046875, + "learning_rate": 6.995e-05, + "loss": 0.1664, + "step": 19132 + }, + { + "epoch": 0.306128, + "grad_norm": 0.59375, + "learning_rate": 6.994838709677419e-05, + "loss": 0.18, + "step": 19133 + }, + { + "epoch": 0.306144, + "grad_norm": 0.5859375, + "learning_rate": 6.994677419354839e-05, + "loss": 0.111, + "step": 19134 + }, + { + "epoch": 0.30616, + "grad_norm": 0.8828125, + "learning_rate": 6.994516129032258e-05, + "loss": 0.1684, + "step": 19135 + }, + { + "epoch": 0.306176, + "grad_norm": 0.734375, + "learning_rate": 6.994354838709678e-05, + "loss": 0.1561, + "step": 19136 + }, + { + "epoch": 0.306192, + "grad_norm": 1.34375, + "learning_rate": 6.994193548387096e-05, + "loss": 0.184, + "step": 19137 + }, + { + "epoch": 0.306208, + "grad_norm": 0.6640625, + "learning_rate": 6.994032258064516e-05, + "loss": 0.1539, + "step": 19138 + }, + { + "epoch": 0.306224, + "grad_norm": 0.67578125, + "learning_rate": 6.993870967741936e-05, + "loss": 0.1744, + "step": 19139 + }, + { + "epoch": 0.30624, + "grad_norm": 0.88671875, + "learning_rate": 6.993709677419355e-05, + "loss": 0.1744, + "step": 19140 + }, + { + "epoch": 0.306256, + "grad_norm": 0.58203125, + "learning_rate": 6.993548387096775e-05, + "loss": 0.1346, + "step": 19141 + }, + { + "epoch": 0.306272, + "grad_norm": 0.7109375, + "learning_rate": 6.993387096774195e-05, + "loss": 0.1676, + "step": 19142 + }, + { + "epoch": 0.306288, + "grad_norm": 0.99609375, + "learning_rate": 6.993225806451613e-05, + "loss": 0.1366, + "step": 19143 + }, + { + "epoch": 0.306304, + "grad_norm": 1.2265625, + "learning_rate": 6.993064516129033e-05, + "loss": 0.1655, + "step": 19144 + }, + { + "epoch": 0.30632, + "grad_norm": 0.74609375, + "learning_rate": 6.992903225806452e-05, + "loss": 0.1616, + "step": 19145 + }, + { + "epoch": 0.306336, + "grad_norm": 0.52734375, + "learning_rate": 6.99274193548387e-05, + "loss": 0.1607, + "step": 19146 + }, + { + "epoch": 0.306352, + "grad_norm": 1.265625, + "learning_rate": 6.99258064516129e-05, + "loss": 0.2251, + "step": 19147 + }, + { + "epoch": 0.306368, + "grad_norm": 0.6875, + "learning_rate": 6.992419354838709e-05, + "loss": 0.1628, + "step": 19148 + }, + { + "epoch": 0.306384, + "grad_norm": 0.96484375, + "learning_rate": 6.992258064516129e-05, + "loss": 0.1153, + "step": 19149 + }, + { + "epoch": 0.3064, + "grad_norm": 1.2578125, + "learning_rate": 6.992096774193548e-05, + "loss": 0.1677, + "step": 19150 + }, + { + "epoch": 0.306416, + "grad_norm": 0.71484375, + "learning_rate": 6.991935483870968e-05, + "loss": 0.1888, + "step": 19151 + }, + { + "epoch": 0.306432, + "grad_norm": 0.7265625, + "learning_rate": 6.991774193548388e-05, + "loss": 0.1794, + "step": 19152 + }, + { + "epoch": 0.306448, + "grad_norm": 1.1640625, + "learning_rate": 6.991612903225808e-05, + "loss": 0.1961, + "step": 19153 + }, + { + "epoch": 0.306464, + "grad_norm": 1.5703125, + "learning_rate": 6.991451612903226e-05, + "loss": 0.1977, + "step": 19154 + }, + { + "epoch": 0.30648, + "grad_norm": 0.84765625, + "learning_rate": 6.991290322580646e-05, + "loss": 0.201, + "step": 19155 + }, + { + "epoch": 0.306496, + "grad_norm": 0.9375, + "learning_rate": 6.991129032258065e-05, + "loss": 0.1569, + "step": 19156 + }, + { + "epoch": 0.306512, + "grad_norm": 1.140625, + "learning_rate": 6.990967741935485e-05, + "loss": 0.1888, + "step": 19157 + }, + { + "epoch": 0.306528, + "grad_norm": 0.90234375, + "learning_rate": 6.990806451612903e-05, + "loss": 0.1623, + "step": 19158 + }, + { + "epoch": 0.306544, + "grad_norm": 0.81640625, + "learning_rate": 6.990645161290323e-05, + "loss": 0.1569, + "step": 19159 + }, + { + "epoch": 0.30656, + "grad_norm": 0.78515625, + "learning_rate": 6.990483870967742e-05, + "loss": 0.2101, + "step": 19160 + }, + { + "epoch": 0.306576, + "grad_norm": 0.7890625, + "learning_rate": 6.99032258064516e-05, + "loss": 0.1899, + "step": 19161 + }, + { + "epoch": 0.306592, + "grad_norm": 0.71875, + "learning_rate": 6.99016129032258e-05, + "loss": 0.1817, + "step": 19162 + }, + { + "epoch": 0.306608, + "grad_norm": 0.55078125, + "learning_rate": 6.99e-05, + "loss": 0.1359, + "step": 19163 + }, + { + "epoch": 0.306624, + "grad_norm": 1.15625, + "learning_rate": 6.98983870967742e-05, + "loss": 0.1542, + "step": 19164 + }, + { + "epoch": 0.30664, + "grad_norm": 0.84375, + "learning_rate": 6.989677419354839e-05, + "loss": 0.223, + "step": 19165 + }, + { + "epoch": 0.306656, + "grad_norm": 0.828125, + "learning_rate": 6.989516129032259e-05, + "loss": 0.1632, + "step": 19166 + }, + { + "epoch": 0.306672, + "grad_norm": 1.3515625, + "learning_rate": 6.989354838709678e-05, + "loss": 0.2003, + "step": 19167 + }, + { + "epoch": 0.306688, + "grad_norm": 0.6015625, + "learning_rate": 6.989193548387098e-05, + "loss": 0.1462, + "step": 19168 + }, + { + "epoch": 0.306704, + "grad_norm": 0.69140625, + "learning_rate": 6.989032258064516e-05, + "loss": 0.2084, + "step": 19169 + }, + { + "epoch": 0.30672, + "grad_norm": 0.92578125, + "learning_rate": 6.988870967741936e-05, + "loss": 0.1696, + "step": 19170 + }, + { + "epoch": 0.306736, + "grad_norm": 0.75390625, + "learning_rate": 6.988709677419355e-05, + "loss": 0.191, + "step": 19171 + }, + { + "epoch": 0.306752, + "grad_norm": 0.490234375, + "learning_rate": 6.988548387096775e-05, + "loss": 0.148, + "step": 19172 + }, + { + "epoch": 0.306768, + "grad_norm": 0.79296875, + "learning_rate": 6.988387096774193e-05, + "loss": 0.189, + "step": 19173 + }, + { + "epoch": 0.306784, + "grad_norm": 1.0078125, + "learning_rate": 6.988225806451613e-05, + "loss": 0.187, + "step": 19174 + }, + { + "epoch": 0.3068, + "grad_norm": 0.90625, + "learning_rate": 6.988064516129032e-05, + "loss": 0.1915, + "step": 19175 + }, + { + "epoch": 0.306816, + "grad_norm": 1.6640625, + "learning_rate": 6.987903225806452e-05, + "loss": 0.1563, + "step": 19176 + }, + { + "epoch": 0.306832, + "grad_norm": 0.609375, + "learning_rate": 6.987741935483872e-05, + "loss": 0.174, + "step": 19177 + }, + { + "epoch": 0.306848, + "grad_norm": 0.640625, + "learning_rate": 6.98758064516129e-05, + "loss": 0.1603, + "step": 19178 + }, + { + "epoch": 0.306864, + "grad_norm": 0.66796875, + "learning_rate": 6.98741935483871e-05, + "loss": 0.1683, + "step": 19179 + }, + { + "epoch": 0.30688, + "grad_norm": 0.69140625, + "learning_rate": 6.987258064516129e-05, + "loss": 0.1526, + "step": 19180 + }, + { + "epoch": 0.306896, + "grad_norm": 0.67578125, + "learning_rate": 6.987096774193549e-05, + "loss": 0.1737, + "step": 19181 + }, + { + "epoch": 0.306912, + "grad_norm": 1.2265625, + "learning_rate": 6.986935483870968e-05, + "loss": 0.1704, + "step": 19182 + }, + { + "epoch": 0.306928, + "grad_norm": 0.87890625, + "learning_rate": 6.986774193548388e-05, + "loss": 0.1522, + "step": 19183 + }, + { + "epoch": 0.306944, + "grad_norm": 0.8828125, + "learning_rate": 6.986612903225806e-05, + "loss": 0.1973, + "step": 19184 + }, + { + "epoch": 0.30696, + "grad_norm": 1.6640625, + "learning_rate": 6.986451612903226e-05, + "loss": 0.2092, + "step": 19185 + }, + { + "epoch": 0.306976, + "grad_norm": 0.66015625, + "learning_rate": 6.986290322580645e-05, + "loss": 0.1597, + "step": 19186 + }, + { + "epoch": 0.306992, + "grad_norm": 0.73046875, + "learning_rate": 6.986129032258065e-05, + "loss": 0.1873, + "step": 19187 + }, + { + "epoch": 0.307008, + "grad_norm": 0.859375, + "learning_rate": 6.985967741935485e-05, + "loss": 0.164, + "step": 19188 + }, + { + "epoch": 0.307024, + "grad_norm": 0.828125, + "learning_rate": 6.985806451612905e-05, + "loss": 0.1738, + "step": 19189 + }, + { + "epoch": 0.30704, + "grad_norm": 1.40625, + "learning_rate": 6.985645161290323e-05, + "loss": 0.1266, + "step": 19190 + }, + { + "epoch": 0.307056, + "grad_norm": 0.765625, + "learning_rate": 6.985483870967743e-05, + "loss": 0.1896, + "step": 19191 + }, + { + "epoch": 0.307072, + "grad_norm": 0.5859375, + "learning_rate": 6.985322580645162e-05, + "loss": 0.1802, + "step": 19192 + }, + { + "epoch": 0.307088, + "grad_norm": 0.5703125, + "learning_rate": 6.98516129032258e-05, + "loss": 0.1883, + "step": 19193 + }, + { + "epoch": 0.307104, + "grad_norm": 0.91015625, + "learning_rate": 6.985e-05, + "loss": 0.1855, + "step": 19194 + }, + { + "epoch": 0.30712, + "grad_norm": 0.68359375, + "learning_rate": 6.984838709677419e-05, + "loss": 0.1876, + "step": 19195 + }, + { + "epoch": 0.307136, + "grad_norm": 0.78125, + "learning_rate": 6.984677419354839e-05, + "loss": 0.1967, + "step": 19196 + }, + { + "epoch": 0.307152, + "grad_norm": 0.75390625, + "learning_rate": 6.984516129032258e-05, + "loss": 0.1776, + "step": 19197 + }, + { + "epoch": 0.307168, + "grad_norm": 0.8671875, + "learning_rate": 6.984354838709678e-05, + "loss": 0.2098, + "step": 19198 + }, + { + "epoch": 0.307184, + "grad_norm": 0.7734375, + "learning_rate": 6.984193548387097e-05, + "loss": 0.1613, + "step": 19199 + }, + { + "epoch": 0.3072, + "grad_norm": 0.5546875, + "learning_rate": 6.984032258064517e-05, + "loss": 0.1701, + "step": 19200 + }, + { + "epoch": 0.307216, + "grad_norm": 0.65625, + "learning_rate": 6.983870967741936e-05, + "loss": 0.1698, + "step": 19201 + }, + { + "epoch": 0.307232, + "grad_norm": 0.9375, + "learning_rate": 6.983709677419356e-05, + "loss": 0.1858, + "step": 19202 + }, + { + "epoch": 0.307248, + "grad_norm": 0.609375, + "learning_rate": 6.983548387096775e-05, + "loss": 0.1982, + "step": 19203 + }, + { + "epoch": 0.307264, + "grad_norm": 1.03125, + "learning_rate": 6.983387096774195e-05, + "loss": 0.1774, + "step": 19204 + }, + { + "epoch": 0.30728, + "grad_norm": 0.796875, + "learning_rate": 6.983225806451613e-05, + "loss": 0.1626, + "step": 19205 + }, + { + "epoch": 0.307296, + "grad_norm": 0.69921875, + "learning_rate": 6.983064516129033e-05, + "loss": 0.1679, + "step": 19206 + }, + { + "epoch": 0.307312, + "grad_norm": 0.69140625, + "learning_rate": 6.982903225806452e-05, + "loss": 0.2185, + "step": 19207 + }, + { + "epoch": 0.307328, + "grad_norm": 1.0625, + "learning_rate": 6.98274193548387e-05, + "loss": 0.1856, + "step": 19208 + }, + { + "epoch": 0.307344, + "grad_norm": 0.96484375, + "learning_rate": 6.98258064516129e-05, + "loss": 0.159, + "step": 19209 + }, + { + "epoch": 0.30736, + "grad_norm": 0.54296875, + "learning_rate": 6.982419354838709e-05, + "loss": 0.1654, + "step": 19210 + }, + { + "epoch": 0.307376, + "grad_norm": 1.2421875, + "learning_rate": 6.982258064516129e-05, + "loss": 0.1473, + "step": 19211 + }, + { + "epoch": 0.307392, + "grad_norm": 0.7421875, + "learning_rate": 6.982096774193549e-05, + "loss": 0.1653, + "step": 19212 + }, + { + "epoch": 0.307408, + "grad_norm": 0.796875, + "learning_rate": 6.981935483870969e-05, + "loss": 0.1405, + "step": 19213 + }, + { + "epoch": 0.307424, + "grad_norm": 0.9609375, + "learning_rate": 6.981774193548387e-05, + "loss": 0.1711, + "step": 19214 + }, + { + "epoch": 0.30744, + "grad_norm": 0.96875, + "learning_rate": 6.981612903225807e-05, + "loss": 0.1666, + "step": 19215 + }, + { + "epoch": 0.307456, + "grad_norm": 0.71484375, + "learning_rate": 6.981451612903226e-05, + "loss": 0.149, + "step": 19216 + }, + { + "epoch": 0.307472, + "grad_norm": 0.84375, + "learning_rate": 6.981290322580646e-05, + "loss": 0.2056, + "step": 19217 + }, + { + "epoch": 0.307488, + "grad_norm": 0.90234375, + "learning_rate": 6.981129032258065e-05, + "loss": 0.2139, + "step": 19218 + }, + { + "epoch": 0.307504, + "grad_norm": 0.74609375, + "learning_rate": 6.980967741935485e-05, + "loss": 0.1611, + "step": 19219 + }, + { + "epoch": 0.30752, + "grad_norm": 0.62890625, + "learning_rate": 6.980806451612903e-05, + "loss": 0.1652, + "step": 19220 + }, + { + "epoch": 0.307536, + "grad_norm": 1.5, + "learning_rate": 6.980645161290323e-05, + "loss": 0.1654, + "step": 19221 + }, + { + "epoch": 0.307552, + "grad_norm": 0.96875, + "learning_rate": 6.980483870967742e-05, + "loss": 0.177, + "step": 19222 + }, + { + "epoch": 0.307568, + "grad_norm": 0.99609375, + "learning_rate": 6.980322580645162e-05, + "loss": 0.17, + "step": 19223 + }, + { + "epoch": 0.307584, + "grad_norm": 0.46875, + "learning_rate": 6.980161290322582e-05, + "loss": 0.1605, + "step": 19224 + }, + { + "epoch": 0.3076, + "grad_norm": 0.98828125, + "learning_rate": 6.98e-05, + "loss": 0.1632, + "step": 19225 + }, + { + "epoch": 0.307616, + "grad_norm": 2.0625, + "learning_rate": 6.97983870967742e-05, + "loss": 0.1878, + "step": 19226 + }, + { + "epoch": 0.307632, + "grad_norm": 1.9765625, + "learning_rate": 6.979677419354839e-05, + "loss": 0.1771, + "step": 19227 + }, + { + "epoch": 0.307648, + "grad_norm": 0.66015625, + "learning_rate": 6.979516129032259e-05, + "loss": 0.1891, + "step": 19228 + }, + { + "epoch": 0.307664, + "grad_norm": 0.65234375, + "learning_rate": 6.979354838709677e-05, + "loss": 0.1668, + "step": 19229 + }, + { + "epoch": 0.30768, + "grad_norm": 0.60546875, + "learning_rate": 6.979193548387097e-05, + "loss": 0.1604, + "step": 19230 + }, + { + "epoch": 0.307696, + "grad_norm": 0.6875, + "learning_rate": 6.979032258064516e-05, + "loss": 0.1587, + "step": 19231 + }, + { + "epoch": 0.307712, + "grad_norm": 1.515625, + "learning_rate": 6.978870967741936e-05, + "loss": 0.2134, + "step": 19232 + }, + { + "epoch": 0.307728, + "grad_norm": 0.76953125, + "learning_rate": 6.978709677419355e-05, + "loss": 0.1663, + "step": 19233 + }, + { + "epoch": 0.307744, + "grad_norm": 1.6171875, + "learning_rate": 6.978548387096775e-05, + "loss": 0.1463, + "step": 19234 + }, + { + "epoch": 0.30776, + "grad_norm": 1.2265625, + "learning_rate": 6.978387096774193e-05, + "loss": 0.2058, + "step": 19235 + }, + { + "epoch": 0.307776, + "grad_norm": 1.015625, + "learning_rate": 6.978225806451613e-05, + "loss": 0.1835, + "step": 19236 + }, + { + "epoch": 0.307792, + "grad_norm": 0.71875, + "learning_rate": 6.978064516129033e-05, + "loss": 0.1715, + "step": 19237 + }, + { + "epoch": 0.307808, + "grad_norm": 0.5859375, + "learning_rate": 6.977903225806452e-05, + "loss": 0.1814, + "step": 19238 + }, + { + "epoch": 0.307824, + "grad_norm": 0.71484375, + "learning_rate": 6.977741935483872e-05, + "loss": 0.175, + "step": 19239 + }, + { + "epoch": 0.30784, + "grad_norm": 0.63671875, + "learning_rate": 6.97758064516129e-05, + "loss": 0.1844, + "step": 19240 + }, + { + "epoch": 0.307856, + "grad_norm": 0.60546875, + "learning_rate": 6.97741935483871e-05, + "loss": 0.1639, + "step": 19241 + }, + { + "epoch": 0.307872, + "grad_norm": 0.8828125, + "learning_rate": 6.977258064516129e-05, + "loss": 0.2015, + "step": 19242 + }, + { + "epoch": 0.307888, + "grad_norm": 1.015625, + "learning_rate": 6.977096774193549e-05, + "loss": 0.2138, + "step": 19243 + }, + { + "epoch": 0.307904, + "grad_norm": 0.78515625, + "learning_rate": 6.976935483870967e-05, + "loss": 0.1697, + "step": 19244 + }, + { + "epoch": 0.30792, + "grad_norm": 0.8828125, + "learning_rate": 6.976774193548387e-05, + "loss": 0.195, + "step": 19245 + }, + { + "epoch": 0.307936, + "grad_norm": 0.71875, + "learning_rate": 6.976612903225806e-05, + "loss": 0.1624, + "step": 19246 + }, + { + "epoch": 0.307952, + "grad_norm": 0.5078125, + "learning_rate": 6.976451612903226e-05, + "loss": 0.144, + "step": 19247 + }, + { + "epoch": 0.307968, + "grad_norm": 1.4375, + "learning_rate": 6.976290322580646e-05, + "loss": 0.1712, + "step": 19248 + }, + { + "epoch": 0.307984, + "grad_norm": 0.87109375, + "learning_rate": 6.976129032258066e-05, + "loss": 0.1407, + "step": 19249 + }, + { + "epoch": 0.308, + "grad_norm": 0.94140625, + "learning_rate": 6.975967741935484e-05, + "loss": 0.1954, + "step": 19250 + }, + { + "epoch": 0.308016, + "grad_norm": 0.69140625, + "learning_rate": 6.975806451612904e-05, + "loss": 0.1629, + "step": 19251 + }, + { + "epoch": 0.308032, + "grad_norm": 0.76171875, + "learning_rate": 6.975645161290323e-05, + "loss": 0.1641, + "step": 19252 + }, + { + "epoch": 0.308048, + "grad_norm": 0.93359375, + "learning_rate": 6.975483870967743e-05, + "loss": 0.1292, + "step": 19253 + }, + { + "epoch": 0.308064, + "grad_norm": 0.61328125, + "learning_rate": 6.975322580645162e-05, + "loss": 0.1579, + "step": 19254 + }, + { + "epoch": 0.30808, + "grad_norm": 0.8046875, + "learning_rate": 6.97516129032258e-05, + "loss": 0.2319, + "step": 19255 + }, + { + "epoch": 0.308096, + "grad_norm": 0.671875, + "learning_rate": 6.975e-05, + "loss": 0.1673, + "step": 19256 + }, + { + "epoch": 0.308112, + "grad_norm": 0.73828125, + "learning_rate": 6.974838709677419e-05, + "loss": 0.1132, + "step": 19257 + }, + { + "epoch": 0.308128, + "grad_norm": 0.6796875, + "learning_rate": 6.974677419354839e-05, + "loss": 0.1694, + "step": 19258 + }, + { + "epoch": 0.308144, + "grad_norm": 0.6015625, + "learning_rate": 6.974516129032259e-05, + "loss": 0.1705, + "step": 19259 + }, + { + "epoch": 0.30816, + "grad_norm": 0.83203125, + "learning_rate": 6.974354838709679e-05, + "loss": 0.1866, + "step": 19260 + }, + { + "epoch": 0.308176, + "grad_norm": 0.59765625, + "learning_rate": 6.974193548387097e-05, + "loss": 0.1643, + "step": 19261 + }, + { + "epoch": 0.308192, + "grad_norm": 1.140625, + "learning_rate": 6.974032258064517e-05, + "loss": 0.1729, + "step": 19262 + }, + { + "epoch": 0.308208, + "grad_norm": 0.82421875, + "learning_rate": 6.973870967741936e-05, + "loss": 0.1633, + "step": 19263 + }, + { + "epoch": 0.308224, + "grad_norm": 1.0859375, + "learning_rate": 6.973709677419356e-05, + "loss": 0.1497, + "step": 19264 + }, + { + "epoch": 0.30824, + "grad_norm": 1.03125, + "learning_rate": 6.973548387096774e-05, + "loss": 0.2037, + "step": 19265 + }, + { + "epoch": 0.308256, + "grad_norm": 0.8046875, + "learning_rate": 6.973387096774194e-05, + "loss": 0.1871, + "step": 19266 + }, + { + "epoch": 0.308272, + "grad_norm": 0.78515625, + "learning_rate": 6.973225806451613e-05, + "loss": 0.1386, + "step": 19267 + }, + { + "epoch": 0.308288, + "grad_norm": 0.7421875, + "learning_rate": 6.973064516129033e-05, + "loss": 0.1793, + "step": 19268 + }, + { + "epoch": 0.308304, + "grad_norm": 1.0859375, + "learning_rate": 6.972903225806452e-05, + "loss": 0.1761, + "step": 19269 + }, + { + "epoch": 0.30832, + "grad_norm": 1.0703125, + "learning_rate": 6.97274193548387e-05, + "loss": 0.2403, + "step": 19270 + }, + { + "epoch": 0.308336, + "grad_norm": 0.7890625, + "learning_rate": 6.97258064516129e-05, + "loss": 0.1482, + "step": 19271 + }, + { + "epoch": 0.308352, + "grad_norm": 0.75390625, + "learning_rate": 6.97241935483871e-05, + "loss": 0.1735, + "step": 19272 + }, + { + "epoch": 0.308368, + "grad_norm": 0.67578125, + "learning_rate": 6.97225806451613e-05, + "loss": 0.1422, + "step": 19273 + }, + { + "epoch": 0.308384, + "grad_norm": 0.859375, + "learning_rate": 6.972096774193549e-05, + "loss": 0.204, + "step": 19274 + }, + { + "epoch": 0.3084, + "grad_norm": 1.125, + "learning_rate": 6.971935483870969e-05, + "loss": 0.2036, + "step": 19275 + }, + { + "epoch": 0.308416, + "grad_norm": 0.70703125, + "learning_rate": 6.971774193548387e-05, + "loss": 0.187, + "step": 19276 + }, + { + "epoch": 0.308432, + "grad_norm": 0.87890625, + "learning_rate": 6.971612903225807e-05, + "loss": 0.1737, + "step": 19277 + }, + { + "epoch": 0.308448, + "grad_norm": 0.65625, + "learning_rate": 6.971451612903226e-05, + "loss": 0.1892, + "step": 19278 + }, + { + "epoch": 0.308464, + "grad_norm": 0.9375, + "learning_rate": 6.971290322580646e-05, + "loss": 0.1659, + "step": 19279 + }, + { + "epoch": 0.30848, + "grad_norm": 0.9765625, + "learning_rate": 6.971129032258064e-05, + "loss": 0.1748, + "step": 19280 + }, + { + "epoch": 0.308496, + "grad_norm": 0.75, + "learning_rate": 6.970967741935484e-05, + "loss": 0.1503, + "step": 19281 + }, + { + "epoch": 0.308512, + "grad_norm": 0.65625, + "learning_rate": 6.970806451612903e-05, + "loss": 0.1699, + "step": 19282 + }, + { + "epoch": 0.308528, + "grad_norm": 0.703125, + "learning_rate": 6.970645161290323e-05, + "loss": 0.1507, + "step": 19283 + }, + { + "epoch": 0.308544, + "grad_norm": 0.6875, + "learning_rate": 6.970483870967743e-05, + "loss": 0.1479, + "step": 19284 + }, + { + "epoch": 0.30856, + "grad_norm": 0.73046875, + "learning_rate": 6.970322580645161e-05, + "loss": 0.1725, + "step": 19285 + }, + { + "epoch": 0.308576, + "grad_norm": 0.78515625, + "learning_rate": 6.970161290322581e-05, + "loss": 0.1714, + "step": 19286 + }, + { + "epoch": 0.308592, + "grad_norm": 1.0390625, + "learning_rate": 6.97e-05, + "loss": 0.2191, + "step": 19287 + }, + { + "epoch": 0.308608, + "grad_norm": 1.0, + "learning_rate": 6.96983870967742e-05, + "loss": 0.135, + "step": 19288 + }, + { + "epoch": 0.308624, + "grad_norm": 0.62890625, + "learning_rate": 6.969677419354839e-05, + "loss": 0.1373, + "step": 19289 + }, + { + "epoch": 0.30864, + "grad_norm": 0.66796875, + "learning_rate": 6.969516129032259e-05, + "loss": 0.1701, + "step": 19290 + }, + { + "epoch": 0.308656, + "grad_norm": 1.046875, + "learning_rate": 6.969354838709677e-05, + "loss": 0.1602, + "step": 19291 + }, + { + "epoch": 0.308672, + "grad_norm": 0.83984375, + "learning_rate": 6.969193548387097e-05, + "loss": 0.1578, + "step": 19292 + }, + { + "epoch": 0.308688, + "grad_norm": 0.63671875, + "learning_rate": 6.969032258064516e-05, + "loss": 0.1646, + "step": 19293 + }, + { + "epoch": 0.308704, + "grad_norm": 0.99609375, + "learning_rate": 6.968870967741936e-05, + "loss": 0.2074, + "step": 19294 + }, + { + "epoch": 0.30872, + "grad_norm": 1.2734375, + "learning_rate": 6.968709677419356e-05, + "loss": 0.1867, + "step": 19295 + }, + { + "epoch": 0.308736, + "grad_norm": 0.8515625, + "learning_rate": 6.968548387096774e-05, + "loss": 0.1652, + "step": 19296 + }, + { + "epoch": 0.308752, + "grad_norm": 0.65625, + "learning_rate": 6.968387096774194e-05, + "loss": 0.1606, + "step": 19297 + }, + { + "epoch": 0.308768, + "grad_norm": 0.82421875, + "learning_rate": 6.968225806451614e-05, + "loss": 0.174, + "step": 19298 + }, + { + "epoch": 0.308784, + "grad_norm": 0.6875, + "learning_rate": 6.968064516129033e-05, + "loss": 0.2028, + "step": 19299 + }, + { + "epoch": 0.3088, + "grad_norm": 0.7890625, + "learning_rate": 6.967903225806453e-05, + "loss": 0.1439, + "step": 19300 + }, + { + "epoch": 0.308816, + "grad_norm": 0.640625, + "learning_rate": 6.967741935483871e-05, + "loss": 0.1185, + "step": 19301 + }, + { + "epoch": 0.308832, + "grad_norm": 0.68359375, + "learning_rate": 6.96758064516129e-05, + "loss": 0.2008, + "step": 19302 + }, + { + "epoch": 0.308848, + "grad_norm": 0.8125, + "learning_rate": 6.96741935483871e-05, + "loss": 0.2041, + "step": 19303 + }, + { + "epoch": 0.308864, + "grad_norm": 0.8046875, + "learning_rate": 6.967258064516129e-05, + "loss": 0.169, + "step": 19304 + }, + { + "epoch": 0.30888, + "grad_norm": 0.7890625, + "learning_rate": 6.967096774193549e-05, + "loss": 0.1704, + "step": 19305 + }, + { + "epoch": 0.308896, + "grad_norm": 1.09375, + "learning_rate": 6.966935483870967e-05, + "loss": 0.1751, + "step": 19306 + }, + { + "epoch": 0.308912, + "grad_norm": 0.4609375, + "learning_rate": 6.966774193548387e-05, + "loss": 0.1591, + "step": 19307 + }, + { + "epoch": 0.308928, + "grad_norm": 0.94921875, + "learning_rate": 6.966612903225807e-05, + "loss": 0.1725, + "step": 19308 + }, + { + "epoch": 0.308944, + "grad_norm": 0.58203125, + "learning_rate": 6.966451612903227e-05, + "loss": 0.16, + "step": 19309 + }, + { + "epoch": 0.30896, + "grad_norm": 0.875, + "learning_rate": 6.966290322580646e-05, + "loss": 0.2102, + "step": 19310 + }, + { + "epoch": 0.308976, + "grad_norm": 0.90234375, + "learning_rate": 6.966129032258066e-05, + "loss": 0.1961, + "step": 19311 + }, + { + "epoch": 0.308992, + "grad_norm": 0.65625, + "learning_rate": 6.965967741935484e-05, + "loss": 0.14, + "step": 19312 + }, + { + "epoch": 0.309008, + "grad_norm": 0.8125, + "learning_rate": 6.965806451612904e-05, + "loss": 0.1588, + "step": 19313 + }, + { + "epoch": 0.309024, + "grad_norm": 1.4296875, + "learning_rate": 6.965645161290323e-05, + "loss": 0.1652, + "step": 19314 + }, + { + "epoch": 0.30904, + "grad_norm": 0.625, + "learning_rate": 6.965483870967743e-05, + "loss": 0.1635, + "step": 19315 + }, + { + "epoch": 0.309056, + "grad_norm": 0.890625, + "learning_rate": 6.965322580645161e-05, + "loss": 0.1927, + "step": 19316 + }, + { + "epoch": 0.309072, + "grad_norm": 1.0546875, + "learning_rate": 6.96516129032258e-05, + "loss": 0.1975, + "step": 19317 + }, + { + "epoch": 0.309088, + "grad_norm": 0.55078125, + "learning_rate": 6.965e-05, + "loss": 0.1799, + "step": 19318 + }, + { + "epoch": 0.309104, + "grad_norm": 0.7734375, + "learning_rate": 6.96483870967742e-05, + "loss": 0.1676, + "step": 19319 + }, + { + "epoch": 0.30912, + "grad_norm": 0.9609375, + "learning_rate": 6.96467741935484e-05, + "loss": 0.1565, + "step": 19320 + }, + { + "epoch": 0.309136, + "grad_norm": 0.87890625, + "learning_rate": 6.964516129032258e-05, + "loss": 0.1609, + "step": 19321 + }, + { + "epoch": 0.309152, + "grad_norm": 0.90625, + "learning_rate": 6.964354838709678e-05, + "loss": 0.2018, + "step": 19322 + }, + { + "epoch": 0.309168, + "grad_norm": 0.6953125, + "learning_rate": 6.964193548387097e-05, + "loss": 0.1218, + "step": 19323 + }, + { + "epoch": 0.309184, + "grad_norm": 0.84765625, + "learning_rate": 6.964032258064517e-05, + "loss": 0.16, + "step": 19324 + }, + { + "epoch": 0.3092, + "grad_norm": 0.73828125, + "learning_rate": 6.963870967741936e-05, + "loss": 0.1815, + "step": 19325 + }, + { + "epoch": 0.309216, + "grad_norm": 0.65234375, + "learning_rate": 6.963709677419356e-05, + "loss": 0.1809, + "step": 19326 + }, + { + "epoch": 0.309232, + "grad_norm": 1.140625, + "learning_rate": 6.963548387096774e-05, + "loss": 0.1772, + "step": 19327 + }, + { + "epoch": 0.309248, + "grad_norm": 0.8671875, + "learning_rate": 6.963387096774194e-05, + "loss": 0.2091, + "step": 19328 + }, + { + "epoch": 0.309264, + "grad_norm": 0.74609375, + "learning_rate": 6.963225806451613e-05, + "loss": 0.213, + "step": 19329 + }, + { + "epoch": 0.30928, + "grad_norm": 0.94921875, + "learning_rate": 6.963064516129033e-05, + "loss": 0.1784, + "step": 19330 + }, + { + "epoch": 0.309296, + "grad_norm": 0.96484375, + "learning_rate": 6.962903225806451e-05, + "loss": 0.2127, + "step": 19331 + }, + { + "epoch": 0.309312, + "grad_norm": 0.75, + "learning_rate": 6.962741935483871e-05, + "loss": 0.1439, + "step": 19332 + }, + { + "epoch": 0.309328, + "grad_norm": 0.7265625, + "learning_rate": 6.962580645161291e-05, + "loss": 0.1891, + "step": 19333 + }, + { + "epoch": 0.309344, + "grad_norm": 0.734375, + "learning_rate": 6.96241935483871e-05, + "loss": 0.1781, + "step": 19334 + }, + { + "epoch": 0.30936, + "grad_norm": 1.09375, + "learning_rate": 6.96225806451613e-05, + "loss": 0.1623, + "step": 19335 + }, + { + "epoch": 0.309376, + "grad_norm": 0.625, + "learning_rate": 6.962096774193548e-05, + "loss": 0.1677, + "step": 19336 + }, + { + "epoch": 0.309392, + "grad_norm": 0.78515625, + "learning_rate": 6.961935483870968e-05, + "loss": 0.1831, + "step": 19337 + }, + { + "epoch": 0.309408, + "grad_norm": 0.7578125, + "learning_rate": 6.961774193548387e-05, + "loss": 0.1488, + "step": 19338 + }, + { + "epoch": 0.309424, + "grad_norm": 0.7421875, + "learning_rate": 6.961612903225807e-05, + "loss": 0.1683, + "step": 19339 + }, + { + "epoch": 0.30944, + "grad_norm": 0.58984375, + "learning_rate": 6.961451612903226e-05, + "loss": 0.1583, + "step": 19340 + }, + { + "epoch": 0.309456, + "grad_norm": 0.7109375, + "learning_rate": 6.961290322580645e-05, + "loss": 0.1345, + "step": 19341 + }, + { + "epoch": 0.309472, + "grad_norm": 0.97265625, + "learning_rate": 6.961129032258064e-05, + "loss": 0.1623, + "step": 19342 + }, + { + "epoch": 0.309488, + "grad_norm": 0.6640625, + "learning_rate": 6.960967741935484e-05, + "loss": 0.1629, + "step": 19343 + }, + { + "epoch": 0.309504, + "grad_norm": 0.63671875, + "learning_rate": 6.960806451612904e-05, + "loss": 0.1738, + "step": 19344 + }, + { + "epoch": 0.30952, + "grad_norm": 0.9140625, + "learning_rate": 6.960645161290324e-05, + "loss": 0.1697, + "step": 19345 + }, + { + "epoch": 0.309536, + "grad_norm": 1.5859375, + "learning_rate": 6.960483870967743e-05, + "loss": 0.1696, + "step": 19346 + }, + { + "epoch": 0.309552, + "grad_norm": 0.83203125, + "learning_rate": 6.960322580645161e-05, + "loss": 0.154, + "step": 19347 + }, + { + "epoch": 0.309568, + "grad_norm": 0.64453125, + "learning_rate": 6.960161290322581e-05, + "loss": 0.1712, + "step": 19348 + }, + { + "epoch": 0.309584, + "grad_norm": 0.6953125, + "learning_rate": 6.96e-05, + "loss": 0.1755, + "step": 19349 + }, + { + "epoch": 0.3096, + "grad_norm": 0.609375, + "learning_rate": 6.95983870967742e-05, + "loss": 0.1729, + "step": 19350 + }, + { + "epoch": 0.309616, + "grad_norm": 0.671875, + "learning_rate": 6.959677419354838e-05, + "loss": 0.157, + "step": 19351 + }, + { + "epoch": 0.309632, + "grad_norm": 0.8515625, + "learning_rate": 6.959516129032258e-05, + "loss": 0.1859, + "step": 19352 + }, + { + "epoch": 0.309648, + "grad_norm": 0.80078125, + "learning_rate": 6.959354838709677e-05, + "loss": 0.1624, + "step": 19353 + }, + { + "epoch": 0.309664, + "grad_norm": 0.92578125, + "learning_rate": 6.959193548387097e-05, + "loss": 0.1738, + "step": 19354 + }, + { + "epoch": 0.30968, + "grad_norm": 0.85546875, + "learning_rate": 6.959032258064517e-05, + "loss": 0.1601, + "step": 19355 + }, + { + "epoch": 0.309696, + "grad_norm": 1.1953125, + "learning_rate": 6.958870967741937e-05, + "loss": 0.1465, + "step": 19356 + }, + { + "epoch": 0.309712, + "grad_norm": 0.578125, + "learning_rate": 6.958709677419355e-05, + "loss": 0.1465, + "step": 19357 + }, + { + "epoch": 0.309728, + "grad_norm": 0.79296875, + "learning_rate": 6.958548387096775e-05, + "loss": 0.1991, + "step": 19358 + }, + { + "epoch": 0.309744, + "grad_norm": 0.58984375, + "learning_rate": 6.958387096774194e-05, + "loss": 0.1692, + "step": 19359 + }, + { + "epoch": 0.30976, + "grad_norm": 0.81640625, + "learning_rate": 6.958225806451614e-05, + "loss": 0.1744, + "step": 19360 + }, + { + "epoch": 0.309776, + "grad_norm": 0.765625, + "learning_rate": 6.958064516129033e-05, + "loss": 0.1592, + "step": 19361 + }, + { + "epoch": 0.309792, + "grad_norm": 0.82421875, + "learning_rate": 6.957903225806453e-05, + "loss": 0.1562, + "step": 19362 + }, + { + "epoch": 0.309808, + "grad_norm": 0.5703125, + "learning_rate": 6.957741935483871e-05, + "loss": 0.164, + "step": 19363 + }, + { + "epoch": 0.309824, + "grad_norm": 0.75390625, + "learning_rate": 6.95758064516129e-05, + "loss": 0.1748, + "step": 19364 + }, + { + "epoch": 0.30984, + "grad_norm": 0.56640625, + "learning_rate": 6.95741935483871e-05, + "loss": 0.2158, + "step": 19365 + }, + { + "epoch": 0.309856, + "grad_norm": 0.83203125, + "learning_rate": 6.957258064516128e-05, + "loss": 0.1635, + "step": 19366 + }, + { + "epoch": 0.309872, + "grad_norm": 0.8203125, + "learning_rate": 6.957096774193548e-05, + "loss": 0.1664, + "step": 19367 + }, + { + "epoch": 0.309888, + "grad_norm": 0.92578125, + "learning_rate": 6.956935483870968e-05, + "loss": 0.2025, + "step": 19368 + }, + { + "epoch": 0.309904, + "grad_norm": 0.8046875, + "learning_rate": 6.956774193548388e-05, + "loss": 0.1461, + "step": 19369 + }, + { + "epoch": 0.30992, + "grad_norm": 1.2421875, + "learning_rate": 6.956612903225807e-05, + "loss": 0.1809, + "step": 19370 + }, + { + "epoch": 0.309936, + "grad_norm": 0.76953125, + "learning_rate": 6.956451612903227e-05, + "loss": 0.1796, + "step": 19371 + }, + { + "epoch": 0.309952, + "grad_norm": 1.34375, + "learning_rate": 6.956290322580645e-05, + "loss": 0.2089, + "step": 19372 + }, + { + "epoch": 0.309968, + "grad_norm": 0.95703125, + "learning_rate": 6.956129032258065e-05, + "loss": 0.2313, + "step": 19373 + }, + { + "epoch": 0.309984, + "grad_norm": 0.80859375, + "learning_rate": 6.955967741935484e-05, + "loss": 0.1981, + "step": 19374 + }, + { + "epoch": 0.31, + "grad_norm": 0.78515625, + "learning_rate": 6.955806451612904e-05, + "loss": 0.1431, + "step": 19375 + }, + { + "epoch": 0.310016, + "grad_norm": 0.75390625, + "learning_rate": 6.955645161290323e-05, + "loss": 0.2087, + "step": 19376 + }, + { + "epoch": 0.310032, + "grad_norm": 0.70703125, + "learning_rate": 6.955483870967742e-05, + "loss": 0.1625, + "step": 19377 + }, + { + "epoch": 0.310048, + "grad_norm": 0.55859375, + "learning_rate": 6.955322580645161e-05, + "loss": 0.1723, + "step": 19378 + }, + { + "epoch": 0.310064, + "grad_norm": 0.65625, + "learning_rate": 6.955161290322581e-05, + "loss": 0.1435, + "step": 19379 + }, + { + "epoch": 0.31008, + "grad_norm": 0.5234375, + "learning_rate": 6.955000000000001e-05, + "loss": 0.1346, + "step": 19380 + }, + { + "epoch": 0.310096, + "grad_norm": 0.65625, + "learning_rate": 6.95483870967742e-05, + "loss": 0.1591, + "step": 19381 + }, + { + "epoch": 0.310112, + "grad_norm": 1.125, + "learning_rate": 6.95467741935484e-05, + "loss": 0.2151, + "step": 19382 + }, + { + "epoch": 0.310128, + "grad_norm": 0.703125, + "learning_rate": 6.954516129032258e-05, + "loss": 0.1906, + "step": 19383 + }, + { + "epoch": 0.310144, + "grad_norm": 0.5078125, + "learning_rate": 6.954354838709678e-05, + "loss": 0.1903, + "step": 19384 + }, + { + "epoch": 0.31016, + "grad_norm": 0.6875, + "learning_rate": 6.954193548387097e-05, + "loss": 0.1765, + "step": 19385 + }, + { + "epoch": 0.310176, + "grad_norm": 0.8203125, + "learning_rate": 6.954032258064517e-05, + "loss": 0.1683, + "step": 19386 + }, + { + "epoch": 0.310192, + "grad_norm": 0.734375, + "learning_rate": 6.953870967741935e-05, + "loss": 0.1654, + "step": 19387 + }, + { + "epoch": 0.310208, + "grad_norm": 0.87109375, + "learning_rate": 6.953709677419355e-05, + "loss": 0.1609, + "step": 19388 + }, + { + "epoch": 0.310224, + "grad_norm": 0.6875, + "learning_rate": 6.953548387096774e-05, + "loss": 0.1608, + "step": 19389 + }, + { + "epoch": 0.31024, + "grad_norm": 0.60546875, + "learning_rate": 6.953387096774194e-05, + "loss": 0.1448, + "step": 19390 + }, + { + "epoch": 0.310256, + "grad_norm": 0.80078125, + "learning_rate": 6.953225806451614e-05, + "loss": 0.1933, + "step": 19391 + }, + { + "epoch": 0.310272, + "grad_norm": 0.82421875, + "learning_rate": 6.953064516129032e-05, + "loss": 0.1651, + "step": 19392 + }, + { + "epoch": 0.310288, + "grad_norm": 0.765625, + "learning_rate": 6.952903225806452e-05, + "loss": 0.1808, + "step": 19393 + }, + { + "epoch": 0.310304, + "grad_norm": 0.96875, + "learning_rate": 6.952741935483871e-05, + "loss": 0.1337, + "step": 19394 + }, + { + "epoch": 0.31032, + "grad_norm": 0.890625, + "learning_rate": 6.952580645161291e-05, + "loss": 0.1912, + "step": 19395 + }, + { + "epoch": 0.310336, + "grad_norm": 0.69921875, + "learning_rate": 6.95241935483871e-05, + "loss": 0.1528, + "step": 19396 + }, + { + "epoch": 0.310352, + "grad_norm": 1.015625, + "learning_rate": 6.95225806451613e-05, + "loss": 0.1743, + "step": 19397 + }, + { + "epoch": 0.310368, + "grad_norm": 1.0078125, + "learning_rate": 6.952096774193548e-05, + "loss": 0.2002, + "step": 19398 + }, + { + "epoch": 0.310384, + "grad_norm": 0.80078125, + "learning_rate": 6.951935483870968e-05, + "loss": 0.1691, + "step": 19399 + }, + { + "epoch": 0.3104, + "grad_norm": 0.55078125, + "learning_rate": 6.951774193548387e-05, + "loss": 0.1311, + "step": 19400 + }, + { + "epoch": 0.310416, + "grad_norm": 0.76953125, + "learning_rate": 6.951612903225807e-05, + "loss": 0.171, + "step": 19401 + }, + { + "epoch": 0.310432, + "grad_norm": 0.94921875, + "learning_rate": 6.951451612903225e-05, + "loss": 0.1893, + "step": 19402 + }, + { + "epoch": 0.310448, + "grad_norm": 0.703125, + "learning_rate": 6.951290322580645e-05, + "loss": 0.1883, + "step": 19403 + }, + { + "epoch": 0.310464, + "grad_norm": 0.6953125, + "learning_rate": 6.951129032258065e-05, + "loss": 0.1724, + "step": 19404 + }, + { + "epoch": 0.31048, + "grad_norm": 0.96484375, + "learning_rate": 6.950967741935485e-05, + "loss": 0.1748, + "step": 19405 + }, + { + "epoch": 0.310496, + "grad_norm": 1.1328125, + "learning_rate": 6.950806451612904e-05, + "loss": 0.2063, + "step": 19406 + }, + { + "epoch": 0.310512, + "grad_norm": 0.62890625, + "learning_rate": 6.950645161290324e-05, + "loss": 0.1648, + "step": 19407 + }, + { + "epoch": 0.310528, + "grad_norm": 0.6875, + "learning_rate": 6.950483870967742e-05, + "loss": 0.2224, + "step": 19408 + }, + { + "epoch": 0.310544, + "grad_norm": 0.71875, + "learning_rate": 6.950322580645162e-05, + "loss": 0.1698, + "step": 19409 + }, + { + "epoch": 0.31056, + "grad_norm": 0.984375, + "learning_rate": 6.950161290322581e-05, + "loss": 0.2197, + "step": 19410 + }, + { + "epoch": 0.310576, + "grad_norm": 0.7890625, + "learning_rate": 6.95e-05, + "loss": 0.1373, + "step": 19411 + }, + { + "epoch": 0.310592, + "grad_norm": 0.55078125, + "learning_rate": 6.94983870967742e-05, + "loss": 0.1471, + "step": 19412 + }, + { + "epoch": 0.310608, + "grad_norm": 0.55078125, + "learning_rate": 6.949677419354838e-05, + "loss": 0.1291, + "step": 19413 + }, + { + "epoch": 0.310624, + "grad_norm": 0.69140625, + "learning_rate": 6.949516129032258e-05, + "loss": 0.1518, + "step": 19414 + }, + { + "epoch": 0.31064, + "grad_norm": 0.95703125, + "learning_rate": 6.949354838709678e-05, + "loss": 0.1713, + "step": 19415 + }, + { + "epoch": 0.310656, + "grad_norm": 0.75, + "learning_rate": 6.949193548387098e-05, + "loss": 0.1635, + "step": 19416 + }, + { + "epoch": 0.310672, + "grad_norm": 0.75390625, + "learning_rate": 6.949032258064517e-05, + "loss": 0.1709, + "step": 19417 + }, + { + "epoch": 0.310688, + "grad_norm": 1.1953125, + "learning_rate": 6.948870967741937e-05, + "loss": 0.1534, + "step": 19418 + }, + { + "epoch": 0.310704, + "grad_norm": 0.8515625, + "learning_rate": 6.948709677419355e-05, + "loss": 0.1904, + "step": 19419 + }, + { + "epoch": 0.31072, + "grad_norm": 0.96484375, + "learning_rate": 6.948548387096775e-05, + "loss": 0.1678, + "step": 19420 + }, + { + "epoch": 0.310736, + "grad_norm": 0.61328125, + "learning_rate": 6.948387096774194e-05, + "loss": 0.1456, + "step": 19421 + }, + { + "epoch": 0.310752, + "grad_norm": 1.8125, + "learning_rate": 6.948225806451614e-05, + "loss": 0.1281, + "step": 19422 + }, + { + "epoch": 0.310768, + "grad_norm": 0.68359375, + "learning_rate": 6.948064516129032e-05, + "loss": 0.1707, + "step": 19423 + }, + { + "epoch": 0.310784, + "grad_norm": 0.6640625, + "learning_rate": 6.947903225806452e-05, + "loss": 0.136, + "step": 19424 + }, + { + "epoch": 0.3108, + "grad_norm": 1.3046875, + "learning_rate": 6.947741935483871e-05, + "loss": 0.1819, + "step": 19425 + }, + { + "epoch": 0.310816, + "grad_norm": 0.62109375, + "learning_rate": 6.94758064516129e-05, + "loss": 0.1714, + "step": 19426 + }, + { + "epoch": 0.310832, + "grad_norm": 0.6875, + "learning_rate": 6.94741935483871e-05, + "loss": 0.1732, + "step": 19427 + }, + { + "epoch": 0.310848, + "grad_norm": 1.2734375, + "learning_rate": 6.94725806451613e-05, + "loss": 0.2058, + "step": 19428 + }, + { + "epoch": 0.310864, + "grad_norm": 0.95703125, + "learning_rate": 6.94709677419355e-05, + "loss": 0.1667, + "step": 19429 + }, + { + "epoch": 0.31088, + "grad_norm": 0.83203125, + "learning_rate": 6.946935483870968e-05, + "loss": 0.1384, + "step": 19430 + }, + { + "epoch": 0.310896, + "grad_norm": 0.97265625, + "learning_rate": 6.946774193548388e-05, + "loss": 0.1724, + "step": 19431 + }, + { + "epoch": 0.310912, + "grad_norm": 1.140625, + "learning_rate": 6.946612903225807e-05, + "loss": 0.1963, + "step": 19432 + }, + { + "epoch": 0.310928, + "grad_norm": 0.515625, + "learning_rate": 6.946451612903227e-05, + "loss": 0.1568, + "step": 19433 + }, + { + "epoch": 0.310944, + "grad_norm": 0.53515625, + "learning_rate": 6.946290322580645e-05, + "loss": 0.1798, + "step": 19434 + }, + { + "epoch": 0.31096, + "grad_norm": 0.6328125, + "learning_rate": 6.946129032258065e-05, + "loss": 0.1937, + "step": 19435 + }, + { + "epoch": 0.310976, + "grad_norm": 1.1015625, + "learning_rate": 6.945967741935484e-05, + "loss": 0.1808, + "step": 19436 + }, + { + "epoch": 0.310992, + "grad_norm": 0.89453125, + "learning_rate": 6.945806451612904e-05, + "loss": 0.1152, + "step": 19437 + }, + { + "epoch": 0.311008, + "grad_norm": 0.765625, + "learning_rate": 6.945645161290322e-05, + "loss": 0.1957, + "step": 19438 + }, + { + "epoch": 0.311024, + "grad_norm": 0.70703125, + "learning_rate": 6.945483870967742e-05, + "loss": 0.1767, + "step": 19439 + }, + { + "epoch": 0.31104, + "grad_norm": 0.671875, + "learning_rate": 6.945322580645162e-05, + "loss": 0.1522, + "step": 19440 + }, + { + "epoch": 0.311056, + "grad_norm": 0.671875, + "learning_rate": 6.945161290322581e-05, + "loss": 0.2026, + "step": 19441 + }, + { + "epoch": 0.311072, + "grad_norm": 0.53515625, + "learning_rate": 6.945000000000001e-05, + "loss": 0.142, + "step": 19442 + }, + { + "epoch": 0.311088, + "grad_norm": 0.578125, + "learning_rate": 6.94483870967742e-05, + "loss": 0.1812, + "step": 19443 + }, + { + "epoch": 0.311104, + "grad_norm": 0.73828125, + "learning_rate": 6.94467741935484e-05, + "loss": 0.1821, + "step": 19444 + }, + { + "epoch": 0.31112, + "grad_norm": 0.890625, + "learning_rate": 6.944516129032258e-05, + "loss": 0.2182, + "step": 19445 + }, + { + "epoch": 0.311136, + "grad_norm": 0.77734375, + "learning_rate": 6.944354838709678e-05, + "loss": 0.1445, + "step": 19446 + }, + { + "epoch": 0.311152, + "grad_norm": 1.390625, + "learning_rate": 6.944193548387097e-05, + "loss": 0.1714, + "step": 19447 + }, + { + "epoch": 0.311168, + "grad_norm": 0.75390625, + "learning_rate": 6.944032258064516e-05, + "loss": 0.1182, + "step": 19448 + }, + { + "epoch": 0.311184, + "grad_norm": 0.5859375, + "learning_rate": 6.943870967741935e-05, + "loss": 0.1724, + "step": 19449 + }, + { + "epoch": 0.3112, + "grad_norm": 0.5703125, + "learning_rate": 6.943709677419355e-05, + "loss": 0.1325, + "step": 19450 + }, + { + "epoch": 0.311216, + "grad_norm": 0.8984375, + "learning_rate": 6.943548387096775e-05, + "loss": 0.1823, + "step": 19451 + }, + { + "epoch": 0.311232, + "grad_norm": 0.703125, + "learning_rate": 6.943387096774195e-05, + "loss": 0.1601, + "step": 19452 + }, + { + "epoch": 0.311248, + "grad_norm": 0.828125, + "learning_rate": 6.943225806451614e-05, + "loss": 0.19, + "step": 19453 + }, + { + "epoch": 0.311264, + "grad_norm": 0.58203125, + "learning_rate": 6.943064516129034e-05, + "loss": 0.1511, + "step": 19454 + }, + { + "epoch": 0.31128, + "grad_norm": 0.7890625, + "learning_rate": 6.942903225806452e-05, + "loss": 0.1762, + "step": 19455 + }, + { + "epoch": 0.311296, + "grad_norm": 0.671875, + "learning_rate": 6.942741935483871e-05, + "loss": 0.1286, + "step": 19456 + }, + { + "epoch": 0.311312, + "grad_norm": 1.03125, + "learning_rate": 6.942580645161291e-05, + "loss": 0.1742, + "step": 19457 + }, + { + "epoch": 0.311328, + "grad_norm": 0.7265625, + "learning_rate": 6.94241935483871e-05, + "loss": 0.1861, + "step": 19458 + }, + { + "epoch": 0.311344, + "grad_norm": 0.8359375, + "learning_rate": 6.942258064516129e-05, + "loss": 0.1663, + "step": 19459 + }, + { + "epoch": 0.31136, + "grad_norm": 0.546875, + "learning_rate": 6.942096774193548e-05, + "loss": 0.1337, + "step": 19460 + }, + { + "epoch": 0.311376, + "grad_norm": 1.1953125, + "learning_rate": 6.941935483870968e-05, + "loss": 0.1446, + "step": 19461 + }, + { + "epoch": 0.311392, + "grad_norm": 0.79296875, + "learning_rate": 6.941774193548386e-05, + "loss": 0.1603, + "step": 19462 + }, + { + "epoch": 0.311408, + "grad_norm": 0.97265625, + "learning_rate": 6.941612903225806e-05, + "loss": 0.2012, + "step": 19463 + }, + { + "epoch": 0.311424, + "grad_norm": 0.7109375, + "learning_rate": 6.941451612903226e-05, + "loss": 0.1984, + "step": 19464 + }, + { + "epoch": 0.31144, + "grad_norm": 0.63671875, + "learning_rate": 6.941290322580646e-05, + "loss": 0.1976, + "step": 19465 + }, + { + "epoch": 0.311456, + "grad_norm": 0.5390625, + "learning_rate": 6.941129032258065e-05, + "loss": 0.1364, + "step": 19466 + }, + { + "epoch": 0.311472, + "grad_norm": 0.78515625, + "learning_rate": 6.940967741935485e-05, + "loss": 0.1647, + "step": 19467 + }, + { + "epoch": 0.311488, + "grad_norm": 1.3828125, + "learning_rate": 6.940806451612904e-05, + "loss": 0.1786, + "step": 19468 + }, + { + "epoch": 0.311504, + "grad_norm": 0.671875, + "learning_rate": 6.940645161290323e-05, + "loss": 0.1285, + "step": 19469 + }, + { + "epoch": 0.31152, + "grad_norm": 0.60546875, + "learning_rate": 6.940483870967742e-05, + "loss": 0.1603, + "step": 19470 + }, + { + "epoch": 0.311536, + "grad_norm": 0.7109375, + "learning_rate": 6.940322580645162e-05, + "loss": 0.195, + "step": 19471 + }, + { + "epoch": 0.311552, + "grad_norm": 1.0234375, + "learning_rate": 6.94016129032258e-05, + "loss": 0.171, + "step": 19472 + }, + { + "epoch": 0.311568, + "grad_norm": 0.71484375, + "learning_rate": 6.939999999999999e-05, + "loss": 0.1692, + "step": 19473 + }, + { + "epoch": 0.311584, + "grad_norm": 0.58203125, + "learning_rate": 6.939838709677419e-05, + "loss": 0.1673, + "step": 19474 + }, + { + "epoch": 0.3116, + "grad_norm": 0.65625, + "learning_rate": 6.939677419354839e-05, + "loss": 0.1555, + "step": 19475 + }, + { + "epoch": 0.311616, + "grad_norm": 0.8828125, + "learning_rate": 6.939516129032259e-05, + "loss": 0.1486, + "step": 19476 + }, + { + "epoch": 0.311632, + "grad_norm": 0.8515625, + "learning_rate": 6.939354838709678e-05, + "loss": 0.129, + "step": 19477 + }, + { + "epoch": 0.311648, + "grad_norm": 0.6640625, + "learning_rate": 6.939193548387098e-05, + "loss": 0.1673, + "step": 19478 + }, + { + "epoch": 0.311664, + "grad_norm": 0.60546875, + "learning_rate": 6.939032258064516e-05, + "loss": 0.1773, + "step": 19479 + }, + { + "epoch": 0.31168, + "grad_norm": 0.64453125, + "learning_rate": 6.938870967741936e-05, + "loss": 0.1519, + "step": 19480 + }, + { + "epoch": 0.311696, + "grad_norm": 0.6328125, + "learning_rate": 6.938709677419355e-05, + "loss": 0.1828, + "step": 19481 + }, + { + "epoch": 0.311712, + "grad_norm": 1.1015625, + "learning_rate": 6.938548387096775e-05, + "loss": 0.144, + "step": 19482 + }, + { + "epoch": 0.311728, + "grad_norm": 0.640625, + "learning_rate": 6.938387096774193e-05, + "loss": 0.2133, + "step": 19483 + }, + { + "epoch": 0.311744, + "grad_norm": 0.71875, + "learning_rate": 6.938225806451613e-05, + "loss": 0.1552, + "step": 19484 + }, + { + "epoch": 0.31176, + "grad_norm": 0.546875, + "learning_rate": 6.938064516129032e-05, + "loss": 0.1587, + "step": 19485 + }, + { + "epoch": 0.311776, + "grad_norm": 0.52734375, + "learning_rate": 6.937903225806452e-05, + "loss": 0.1795, + "step": 19486 + }, + { + "epoch": 0.311792, + "grad_norm": 0.78515625, + "learning_rate": 6.93774193548387e-05, + "loss": 0.1853, + "step": 19487 + }, + { + "epoch": 0.311808, + "grad_norm": 0.85546875, + "learning_rate": 6.93758064516129e-05, + "loss": 0.147, + "step": 19488 + }, + { + "epoch": 0.311824, + "grad_norm": 0.734375, + "learning_rate": 6.93741935483871e-05, + "loss": 0.1865, + "step": 19489 + }, + { + "epoch": 0.31184, + "grad_norm": 0.9375, + "learning_rate": 6.937258064516129e-05, + "loss": 0.1823, + "step": 19490 + }, + { + "epoch": 0.311856, + "grad_norm": 0.84765625, + "learning_rate": 6.937096774193549e-05, + "loss": 0.1913, + "step": 19491 + }, + { + "epoch": 0.311872, + "grad_norm": 0.8671875, + "learning_rate": 6.936935483870968e-05, + "loss": 0.2302, + "step": 19492 + }, + { + "epoch": 0.311888, + "grad_norm": 1.015625, + "learning_rate": 6.936774193548388e-05, + "loss": 0.1657, + "step": 19493 + }, + { + "epoch": 0.311904, + "grad_norm": 0.640625, + "learning_rate": 6.936612903225806e-05, + "loss": 0.147, + "step": 19494 + }, + { + "epoch": 0.31192, + "grad_norm": 0.76953125, + "learning_rate": 6.936451612903226e-05, + "loss": 0.1792, + "step": 19495 + }, + { + "epoch": 0.311936, + "grad_norm": 0.5234375, + "learning_rate": 6.936290322580645e-05, + "loss": 0.1423, + "step": 19496 + }, + { + "epoch": 0.311952, + "grad_norm": 0.83203125, + "learning_rate": 6.936129032258065e-05, + "loss": 0.1769, + "step": 19497 + }, + { + "epoch": 0.311968, + "grad_norm": 1.078125, + "learning_rate": 6.935967741935483e-05, + "loss": 0.1502, + "step": 19498 + }, + { + "epoch": 0.311984, + "grad_norm": 1.1953125, + "learning_rate": 6.935806451612903e-05, + "loss": 0.1645, + "step": 19499 + }, + { + "epoch": 0.312, + "grad_norm": 0.69921875, + "learning_rate": 6.935645161290323e-05, + "loss": 0.1224, + "step": 19500 + }, + { + "epoch": 0.312016, + "grad_norm": 0.69921875, + "learning_rate": 6.935483870967743e-05, + "loss": 0.1472, + "step": 19501 + }, + { + "epoch": 0.312032, + "grad_norm": 1.1015625, + "learning_rate": 6.935322580645162e-05, + "loss": 0.188, + "step": 19502 + }, + { + "epoch": 0.312048, + "grad_norm": 0.7421875, + "learning_rate": 6.93516129032258e-05, + "loss": 0.1841, + "step": 19503 + }, + { + "epoch": 0.312064, + "grad_norm": 0.8203125, + "learning_rate": 6.935e-05, + "loss": 0.1428, + "step": 19504 + }, + { + "epoch": 0.31208, + "grad_norm": 0.6171875, + "learning_rate": 6.934838709677419e-05, + "loss": 0.1664, + "step": 19505 + }, + { + "epoch": 0.312096, + "grad_norm": 0.8046875, + "learning_rate": 6.934677419354839e-05, + "loss": 0.1547, + "step": 19506 + }, + { + "epoch": 0.312112, + "grad_norm": 0.83984375, + "learning_rate": 6.934516129032258e-05, + "loss": 0.1918, + "step": 19507 + }, + { + "epoch": 0.312128, + "grad_norm": 0.64453125, + "learning_rate": 6.934354838709678e-05, + "loss": 0.1458, + "step": 19508 + }, + { + "epoch": 0.312144, + "grad_norm": 0.78515625, + "learning_rate": 6.934193548387096e-05, + "loss": 0.1498, + "step": 19509 + }, + { + "epoch": 0.31216, + "grad_norm": 0.6875, + "learning_rate": 6.934032258064516e-05, + "loss": 0.1476, + "step": 19510 + }, + { + "epoch": 0.312176, + "grad_norm": 0.6328125, + "learning_rate": 6.933870967741936e-05, + "loss": 0.1942, + "step": 19511 + }, + { + "epoch": 0.312192, + "grad_norm": 0.87109375, + "learning_rate": 6.933709677419356e-05, + "loss": 0.1588, + "step": 19512 + }, + { + "epoch": 0.312208, + "grad_norm": 0.70703125, + "learning_rate": 6.933548387096775e-05, + "loss": 0.172, + "step": 19513 + }, + { + "epoch": 0.312224, + "grad_norm": 0.69140625, + "learning_rate": 6.933387096774195e-05, + "loss": 0.1467, + "step": 19514 + }, + { + "epoch": 0.31224, + "grad_norm": 0.71484375, + "learning_rate": 6.933225806451613e-05, + "loss": 0.1846, + "step": 19515 + }, + { + "epoch": 0.312256, + "grad_norm": 0.890625, + "learning_rate": 6.933064516129033e-05, + "loss": 0.1585, + "step": 19516 + }, + { + "epoch": 0.312272, + "grad_norm": 0.984375, + "learning_rate": 6.932903225806452e-05, + "loss": 0.2016, + "step": 19517 + }, + { + "epoch": 0.312288, + "grad_norm": 1.28125, + "learning_rate": 6.932741935483872e-05, + "loss": 0.2018, + "step": 19518 + }, + { + "epoch": 0.312304, + "grad_norm": 0.921875, + "learning_rate": 6.93258064516129e-05, + "loss": 0.1842, + "step": 19519 + }, + { + "epoch": 0.31232, + "grad_norm": 0.88671875, + "learning_rate": 6.932419354838709e-05, + "loss": 0.1595, + "step": 19520 + }, + { + "epoch": 0.312336, + "grad_norm": 0.91015625, + "learning_rate": 6.932258064516129e-05, + "loss": 0.2185, + "step": 19521 + }, + { + "epoch": 0.312352, + "grad_norm": 0.91015625, + "learning_rate": 6.932096774193548e-05, + "loss": 0.1965, + "step": 19522 + }, + { + "epoch": 0.312368, + "grad_norm": 0.625, + "learning_rate": 6.931935483870968e-05, + "loss": 0.1573, + "step": 19523 + }, + { + "epoch": 0.312384, + "grad_norm": 0.71875, + "learning_rate": 6.931774193548388e-05, + "loss": 0.1691, + "step": 19524 + }, + { + "epoch": 0.3124, + "grad_norm": 0.69140625, + "learning_rate": 6.931612903225808e-05, + "loss": 0.1482, + "step": 19525 + }, + { + "epoch": 0.312416, + "grad_norm": 0.62109375, + "learning_rate": 6.931451612903226e-05, + "loss": 0.123, + "step": 19526 + }, + { + "epoch": 0.312432, + "grad_norm": 0.60546875, + "learning_rate": 6.931290322580646e-05, + "loss": 0.1435, + "step": 19527 + }, + { + "epoch": 0.312448, + "grad_norm": 1.0859375, + "learning_rate": 6.931129032258065e-05, + "loss": 0.2008, + "step": 19528 + }, + { + "epoch": 0.312464, + "grad_norm": 1.1640625, + "learning_rate": 6.930967741935485e-05, + "loss": 0.1809, + "step": 19529 + }, + { + "epoch": 0.31248, + "grad_norm": 0.53515625, + "learning_rate": 6.930806451612903e-05, + "loss": 0.1528, + "step": 19530 + }, + { + "epoch": 0.312496, + "grad_norm": 0.69921875, + "learning_rate": 6.930645161290323e-05, + "loss": 0.1578, + "step": 19531 + }, + { + "epoch": 0.312512, + "grad_norm": 0.85546875, + "learning_rate": 6.930483870967742e-05, + "loss": 0.1886, + "step": 19532 + }, + { + "epoch": 0.312528, + "grad_norm": 0.80859375, + "learning_rate": 6.930322580645162e-05, + "loss": 0.1688, + "step": 19533 + }, + { + "epoch": 0.312544, + "grad_norm": 1.1796875, + "learning_rate": 6.93016129032258e-05, + "loss": 0.1819, + "step": 19534 + }, + { + "epoch": 0.31256, + "grad_norm": 1.1171875, + "learning_rate": 6.93e-05, + "loss": 0.1387, + "step": 19535 + }, + { + "epoch": 0.312576, + "grad_norm": 0.82421875, + "learning_rate": 6.92983870967742e-05, + "loss": 0.1929, + "step": 19536 + }, + { + "epoch": 0.312592, + "grad_norm": 0.99609375, + "learning_rate": 6.929677419354839e-05, + "loss": 0.1903, + "step": 19537 + }, + { + "epoch": 0.312608, + "grad_norm": 0.6875, + "learning_rate": 6.929516129032259e-05, + "loss": 0.1744, + "step": 19538 + }, + { + "epoch": 0.312624, + "grad_norm": 0.859375, + "learning_rate": 6.929354838709678e-05, + "loss": 0.2181, + "step": 19539 + }, + { + "epoch": 0.31264, + "grad_norm": 0.76171875, + "learning_rate": 6.929193548387098e-05, + "loss": 0.1759, + "step": 19540 + }, + { + "epoch": 0.312656, + "grad_norm": 0.625, + "learning_rate": 6.929032258064516e-05, + "loss": 0.1778, + "step": 19541 + }, + { + "epoch": 0.312672, + "grad_norm": 0.63671875, + "learning_rate": 6.928870967741936e-05, + "loss": 0.1635, + "step": 19542 + }, + { + "epoch": 0.312688, + "grad_norm": 0.66015625, + "learning_rate": 6.928709677419355e-05, + "loss": 0.1591, + "step": 19543 + }, + { + "epoch": 0.312704, + "grad_norm": 0.58203125, + "learning_rate": 6.928548387096775e-05, + "loss": 0.144, + "step": 19544 + }, + { + "epoch": 0.31272, + "grad_norm": 0.859375, + "learning_rate": 6.928387096774193e-05, + "loss": 0.1937, + "step": 19545 + }, + { + "epoch": 0.312736, + "grad_norm": 1.3828125, + "learning_rate": 6.928225806451613e-05, + "loss": 0.1844, + "step": 19546 + }, + { + "epoch": 0.312752, + "grad_norm": 0.62890625, + "learning_rate": 6.928064516129033e-05, + "loss": 0.1673, + "step": 19547 + }, + { + "epoch": 0.312768, + "grad_norm": 0.6796875, + "learning_rate": 6.927903225806452e-05, + "loss": 0.1591, + "step": 19548 + }, + { + "epoch": 0.312784, + "grad_norm": 1.0546875, + "learning_rate": 6.927741935483872e-05, + "loss": 0.2005, + "step": 19549 + }, + { + "epoch": 0.3128, + "grad_norm": 1.21875, + "learning_rate": 6.92758064516129e-05, + "loss": 0.1618, + "step": 19550 + }, + { + "epoch": 0.312816, + "grad_norm": 0.7265625, + "learning_rate": 6.92741935483871e-05, + "loss": 0.1504, + "step": 19551 + }, + { + "epoch": 0.312832, + "grad_norm": 0.62109375, + "learning_rate": 6.927258064516129e-05, + "loss": 0.1732, + "step": 19552 + }, + { + "epoch": 0.312848, + "grad_norm": 0.96875, + "learning_rate": 6.927096774193549e-05, + "loss": 0.2161, + "step": 19553 + }, + { + "epoch": 0.312864, + "grad_norm": 0.88671875, + "learning_rate": 6.926935483870967e-05, + "loss": 0.1947, + "step": 19554 + }, + { + "epoch": 0.31288, + "grad_norm": 0.734375, + "learning_rate": 6.926774193548387e-05, + "loss": 0.1605, + "step": 19555 + }, + { + "epoch": 0.312896, + "grad_norm": 0.63671875, + "learning_rate": 6.926612903225806e-05, + "loss": 0.1337, + "step": 19556 + }, + { + "epoch": 0.312912, + "grad_norm": 1.25, + "learning_rate": 6.926451612903226e-05, + "loss": 0.1784, + "step": 19557 + }, + { + "epoch": 0.312928, + "grad_norm": 0.6796875, + "learning_rate": 6.926290322580645e-05, + "loss": 0.1767, + "step": 19558 + }, + { + "epoch": 0.312944, + "grad_norm": 0.6328125, + "learning_rate": 6.926129032258065e-05, + "loss": 0.1535, + "step": 19559 + }, + { + "epoch": 0.31296, + "grad_norm": 0.734375, + "learning_rate": 6.925967741935485e-05, + "loss": 0.1418, + "step": 19560 + }, + { + "epoch": 0.312976, + "grad_norm": 0.58984375, + "learning_rate": 6.925806451612905e-05, + "loss": 0.1655, + "step": 19561 + }, + { + "epoch": 0.312992, + "grad_norm": 0.90625, + "learning_rate": 6.925645161290323e-05, + "loss": 0.1673, + "step": 19562 + }, + { + "epoch": 0.313008, + "grad_norm": 0.81640625, + "learning_rate": 6.925483870967743e-05, + "loss": 0.1374, + "step": 19563 + }, + { + "epoch": 0.313024, + "grad_norm": 0.58984375, + "learning_rate": 6.925322580645162e-05, + "loss": 0.1597, + "step": 19564 + }, + { + "epoch": 0.31304, + "grad_norm": 0.6875, + "learning_rate": 6.92516129032258e-05, + "loss": 0.201, + "step": 19565 + }, + { + "epoch": 0.313056, + "grad_norm": 0.875, + "learning_rate": 6.925e-05, + "loss": 0.1865, + "step": 19566 + }, + { + "epoch": 0.313072, + "grad_norm": 0.875, + "learning_rate": 6.924838709677419e-05, + "loss": 0.1788, + "step": 19567 + }, + { + "epoch": 0.313088, + "grad_norm": 0.65625, + "learning_rate": 6.924677419354839e-05, + "loss": 0.1736, + "step": 19568 + }, + { + "epoch": 0.313104, + "grad_norm": 0.6796875, + "learning_rate": 6.924516129032257e-05, + "loss": 0.1645, + "step": 19569 + }, + { + "epoch": 0.31312, + "grad_norm": 0.404296875, + "learning_rate": 6.924354838709677e-05, + "loss": 0.1232, + "step": 19570 + }, + { + "epoch": 0.313136, + "grad_norm": 0.7109375, + "learning_rate": 6.924193548387097e-05, + "loss": 0.1877, + "step": 19571 + }, + { + "epoch": 0.313152, + "grad_norm": 0.83984375, + "learning_rate": 6.924032258064517e-05, + "loss": 0.1696, + "step": 19572 + }, + { + "epoch": 0.313168, + "grad_norm": 0.7265625, + "learning_rate": 6.923870967741936e-05, + "loss": 0.1598, + "step": 19573 + }, + { + "epoch": 0.313184, + "grad_norm": 0.69140625, + "learning_rate": 6.923709677419356e-05, + "loss": 0.1764, + "step": 19574 + }, + { + "epoch": 0.3132, + "grad_norm": 0.765625, + "learning_rate": 6.923548387096775e-05, + "loss": 0.1472, + "step": 19575 + }, + { + "epoch": 0.313216, + "grad_norm": 0.58984375, + "learning_rate": 6.923387096774194e-05, + "loss": 0.155, + "step": 19576 + }, + { + "epoch": 0.313232, + "grad_norm": 0.7578125, + "learning_rate": 6.923225806451613e-05, + "loss": 0.1902, + "step": 19577 + }, + { + "epoch": 0.313248, + "grad_norm": 0.6875, + "learning_rate": 6.923064516129033e-05, + "loss": 0.1755, + "step": 19578 + }, + { + "epoch": 0.313264, + "grad_norm": 0.91796875, + "learning_rate": 6.922903225806452e-05, + "loss": 0.14, + "step": 19579 + }, + { + "epoch": 0.31328, + "grad_norm": 0.6875, + "learning_rate": 6.922741935483872e-05, + "loss": 0.1684, + "step": 19580 + }, + { + "epoch": 0.313296, + "grad_norm": 0.6953125, + "learning_rate": 6.92258064516129e-05, + "loss": 0.1471, + "step": 19581 + }, + { + "epoch": 0.313312, + "grad_norm": 0.73828125, + "learning_rate": 6.922419354838709e-05, + "loss": 0.1572, + "step": 19582 + }, + { + "epoch": 0.313328, + "grad_norm": 0.703125, + "learning_rate": 6.922258064516129e-05, + "loss": 0.1436, + "step": 19583 + }, + { + "epoch": 0.313344, + "grad_norm": 0.75390625, + "learning_rate": 6.922096774193549e-05, + "loss": 0.1765, + "step": 19584 + }, + { + "epoch": 0.31336, + "grad_norm": 0.9453125, + "learning_rate": 6.921935483870969e-05, + "loss": 0.1632, + "step": 19585 + }, + { + "epoch": 0.313376, + "grad_norm": 0.875, + "learning_rate": 6.921774193548387e-05, + "loss": 0.1596, + "step": 19586 + }, + { + "epoch": 0.313392, + "grad_norm": 0.59375, + "learning_rate": 6.921612903225807e-05, + "loss": 0.1755, + "step": 19587 + }, + { + "epoch": 0.313408, + "grad_norm": 0.95703125, + "learning_rate": 6.921451612903226e-05, + "loss": 0.181, + "step": 19588 + }, + { + "epoch": 0.313424, + "grad_norm": 0.98828125, + "learning_rate": 6.921290322580646e-05, + "loss": 0.1577, + "step": 19589 + }, + { + "epoch": 0.31344, + "grad_norm": 0.66015625, + "learning_rate": 6.921129032258064e-05, + "loss": 0.1855, + "step": 19590 + }, + { + "epoch": 0.313456, + "grad_norm": 0.63671875, + "learning_rate": 6.920967741935484e-05, + "loss": 0.193, + "step": 19591 + }, + { + "epoch": 0.313472, + "grad_norm": 0.9453125, + "learning_rate": 6.920806451612903e-05, + "loss": 0.167, + "step": 19592 + }, + { + "epoch": 0.313488, + "grad_norm": 1.1328125, + "learning_rate": 6.920645161290323e-05, + "loss": 0.2335, + "step": 19593 + }, + { + "epoch": 0.313504, + "grad_norm": 0.7265625, + "learning_rate": 6.920483870967742e-05, + "loss": 0.1689, + "step": 19594 + }, + { + "epoch": 0.31352, + "grad_norm": 0.95703125, + "learning_rate": 6.920322580645162e-05, + "loss": 0.1962, + "step": 19595 + }, + { + "epoch": 0.313536, + "grad_norm": 0.78125, + "learning_rate": 6.920161290322582e-05, + "loss": 0.1275, + "step": 19596 + }, + { + "epoch": 0.313552, + "grad_norm": 0.6640625, + "learning_rate": 6.92e-05, + "loss": 0.1394, + "step": 19597 + }, + { + "epoch": 0.313568, + "grad_norm": 0.828125, + "learning_rate": 6.91983870967742e-05, + "loss": 0.1864, + "step": 19598 + }, + { + "epoch": 0.313584, + "grad_norm": 0.64453125, + "learning_rate": 6.919677419354839e-05, + "loss": 0.1744, + "step": 19599 + }, + { + "epoch": 0.3136, + "grad_norm": 1.125, + "learning_rate": 6.919516129032259e-05, + "loss": 0.2461, + "step": 19600 + }, + { + "epoch": 0.313616, + "grad_norm": 0.67578125, + "learning_rate": 6.919354838709677e-05, + "loss": 0.1689, + "step": 19601 + }, + { + "epoch": 0.313632, + "grad_norm": 0.62109375, + "learning_rate": 6.919193548387097e-05, + "loss": 0.1739, + "step": 19602 + }, + { + "epoch": 0.313648, + "grad_norm": 1.046875, + "learning_rate": 6.919032258064516e-05, + "loss": 0.1738, + "step": 19603 + }, + { + "epoch": 0.313664, + "grad_norm": 0.7890625, + "learning_rate": 6.918870967741936e-05, + "loss": 0.1666, + "step": 19604 + }, + { + "epoch": 0.31368, + "grad_norm": 0.6484375, + "learning_rate": 6.918709677419354e-05, + "loss": 0.1333, + "step": 19605 + }, + { + "epoch": 0.313696, + "grad_norm": 0.81640625, + "learning_rate": 6.918548387096774e-05, + "loss": 0.1964, + "step": 19606 + }, + { + "epoch": 0.313712, + "grad_norm": 0.69140625, + "learning_rate": 6.918387096774194e-05, + "loss": 0.1451, + "step": 19607 + }, + { + "epoch": 0.313728, + "grad_norm": 1.078125, + "learning_rate": 6.918225806451614e-05, + "loss": 0.1695, + "step": 19608 + }, + { + "epoch": 0.313744, + "grad_norm": 0.7421875, + "learning_rate": 6.918064516129033e-05, + "loss": 0.1679, + "step": 19609 + }, + { + "epoch": 0.31376, + "grad_norm": 0.65625, + "learning_rate": 6.917903225806453e-05, + "loss": 0.1568, + "step": 19610 + }, + { + "epoch": 0.313776, + "grad_norm": 0.96875, + "learning_rate": 6.917741935483872e-05, + "loss": 0.1796, + "step": 19611 + }, + { + "epoch": 0.313792, + "grad_norm": 0.9921875, + "learning_rate": 6.91758064516129e-05, + "loss": 0.1961, + "step": 19612 + }, + { + "epoch": 0.313808, + "grad_norm": 0.9921875, + "learning_rate": 6.91741935483871e-05, + "loss": 0.1702, + "step": 19613 + }, + { + "epoch": 0.313824, + "grad_norm": 0.625, + "learning_rate": 6.917258064516129e-05, + "loss": 0.1619, + "step": 19614 + }, + { + "epoch": 0.31384, + "grad_norm": 1.03125, + "learning_rate": 6.917096774193549e-05, + "loss": 0.2073, + "step": 19615 + }, + { + "epoch": 0.313856, + "grad_norm": 1.015625, + "learning_rate": 6.916935483870967e-05, + "loss": 0.1776, + "step": 19616 + }, + { + "epoch": 0.313872, + "grad_norm": 0.546875, + "learning_rate": 6.916774193548387e-05, + "loss": 0.1385, + "step": 19617 + }, + { + "epoch": 0.313888, + "grad_norm": 1.015625, + "learning_rate": 6.916612903225806e-05, + "loss": 0.1684, + "step": 19618 + }, + { + "epoch": 0.313904, + "grad_norm": 0.72265625, + "learning_rate": 6.916451612903226e-05, + "loss": 0.1746, + "step": 19619 + }, + { + "epoch": 0.31392, + "grad_norm": 1.3046875, + "learning_rate": 6.916290322580646e-05, + "loss": 0.1665, + "step": 19620 + }, + { + "epoch": 0.313936, + "grad_norm": 0.578125, + "learning_rate": 6.916129032258066e-05, + "loss": 0.138, + "step": 19621 + }, + { + "epoch": 0.313952, + "grad_norm": 0.87109375, + "learning_rate": 6.915967741935484e-05, + "loss": 0.2012, + "step": 19622 + }, + { + "epoch": 0.313968, + "grad_norm": 0.73828125, + "learning_rate": 6.915806451612904e-05, + "loss": 0.1705, + "step": 19623 + }, + { + "epoch": 0.313984, + "grad_norm": 0.75, + "learning_rate": 6.915645161290323e-05, + "loss": 0.1595, + "step": 19624 + }, + { + "epoch": 0.314, + "grad_norm": 0.65234375, + "learning_rate": 6.915483870967743e-05, + "loss": 0.1437, + "step": 19625 + }, + { + "epoch": 0.314016, + "grad_norm": 0.515625, + "learning_rate": 6.915322580645161e-05, + "loss": 0.1616, + "step": 19626 + }, + { + "epoch": 0.314032, + "grad_norm": 0.54296875, + "learning_rate": 6.915161290322581e-05, + "loss": 0.1888, + "step": 19627 + }, + { + "epoch": 0.314048, + "grad_norm": 0.8359375, + "learning_rate": 6.915e-05, + "loss": 0.1998, + "step": 19628 + }, + { + "epoch": 0.314064, + "grad_norm": 0.80078125, + "learning_rate": 6.914838709677419e-05, + "loss": 0.2305, + "step": 19629 + }, + { + "epoch": 0.31408, + "grad_norm": 0.81640625, + "learning_rate": 6.914677419354839e-05, + "loss": 0.1742, + "step": 19630 + }, + { + "epoch": 0.314096, + "grad_norm": 1.03125, + "learning_rate": 6.914516129032259e-05, + "loss": 0.1995, + "step": 19631 + }, + { + "epoch": 0.314112, + "grad_norm": 0.515625, + "learning_rate": 6.914354838709679e-05, + "loss": 0.1427, + "step": 19632 + }, + { + "epoch": 0.314128, + "grad_norm": 0.91015625, + "learning_rate": 6.914193548387097e-05, + "loss": 0.2177, + "step": 19633 + }, + { + "epoch": 0.314144, + "grad_norm": 0.890625, + "learning_rate": 6.914032258064517e-05, + "loss": 0.1535, + "step": 19634 + }, + { + "epoch": 0.31416, + "grad_norm": 0.62890625, + "learning_rate": 6.913870967741936e-05, + "loss": 0.1632, + "step": 19635 + }, + { + "epoch": 0.314176, + "grad_norm": 0.7734375, + "learning_rate": 6.913709677419356e-05, + "loss": 0.1648, + "step": 19636 + }, + { + "epoch": 0.314192, + "grad_norm": 0.890625, + "learning_rate": 6.913548387096774e-05, + "loss": 0.2187, + "step": 19637 + }, + { + "epoch": 0.314208, + "grad_norm": 0.71875, + "learning_rate": 6.913387096774194e-05, + "loss": 0.1922, + "step": 19638 + }, + { + "epoch": 0.314224, + "grad_norm": 0.87109375, + "learning_rate": 6.913225806451613e-05, + "loss": 0.1792, + "step": 19639 + }, + { + "epoch": 0.31424, + "grad_norm": 0.72265625, + "learning_rate": 6.913064516129033e-05, + "loss": 0.152, + "step": 19640 + }, + { + "epoch": 0.314256, + "grad_norm": 0.8828125, + "learning_rate": 6.912903225806451e-05, + "loss": 0.1608, + "step": 19641 + }, + { + "epoch": 0.314272, + "grad_norm": 0.76953125, + "learning_rate": 6.912741935483871e-05, + "loss": 0.1799, + "step": 19642 + }, + { + "epoch": 0.314288, + "grad_norm": 0.6796875, + "learning_rate": 6.912580645161291e-05, + "loss": 0.1785, + "step": 19643 + }, + { + "epoch": 0.314304, + "grad_norm": 1.078125, + "learning_rate": 6.91241935483871e-05, + "loss": 0.1339, + "step": 19644 + }, + { + "epoch": 0.31432, + "grad_norm": 0.69921875, + "learning_rate": 6.91225806451613e-05, + "loss": 0.1701, + "step": 19645 + }, + { + "epoch": 0.314336, + "grad_norm": 0.94921875, + "learning_rate": 6.912096774193549e-05, + "loss": 0.2, + "step": 19646 + }, + { + "epoch": 0.314352, + "grad_norm": 1.578125, + "learning_rate": 6.911935483870968e-05, + "loss": 0.1993, + "step": 19647 + }, + { + "epoch": 0.314368, + "grad_norm": 0.6796875, + "learning_rate": 6.911774193548387e-05, + "loss": 0.1405, + "step": 19648 + }, + { + "epoch": 0.314384, + "grad_norm": 1.03125, + "learning_rate": 6.911612903225807e-05, + "loss": 0.201, + "step": 19649 + }, + { + "epoch": 0.3144, + "grad_norm": 0.67578125, + "learning_rate": 6.911451612903226e-05, + "loss": 0.1679, + "step": 19650 + }, + { + "epoch": 0.314416, + "grad_norm": 0.8984375, + "learning_rate": 6.911290322580646e-05, + "loss": 0.1816, + "step": 19651 + }, + { + "epoch": 0.314432, + "grad_norm": 1.140625, + "learning_rate": 6.911129032258064e-05, + "loss": 0.1354, + "step": 19652 + }, + { + "epoch": 0.314448, + "grad_norm": 0.70703125, + "learning_rate": 6.910967741935484e-05, + "loss": 0.1631, + "step": 19653 + }, + { + "epoch": 0.314464, + "grad_norm": 0.84375, + "learning_rate": 6.910806451612903e-05, + "loss": 0.2112, + "step": 19654 + }, + { + "epoch": 0.31448, + "grad_norm": 0.50390625, + "learning_rate": 6.910645161290323e-05, + "loss": 0.1604, + "step": 19655 + }, + { + "epoch": 0.314496, + "grad_norm": 0.72265625, + "learning_rate": 6.910483870967743e-05, + "loss": 0.1444, + "step": 19656 + }, + { + "epoch": 0.314512, + "grad_norm": 0.57421875, + "learning_rate": 6.910322580645163e-05, + "loss": 0.1504, + "step": 19657 + }, + { + "epoch": 0.314528, + "grad_norm": 0.578125, + "learning_rate": 6.910161290322581e-05, + "loss": 0.1328, + "step": 19658 + }, + { + "epoch": 0.314544, + "grad_norm": 1.0234375, + "learning_rate": 6.91e-05, + "loss": 0.1334, + "step": 19659 + }, + { + "epoch": 0.31456, + "grad_norm": 0.87109375, + "learning_rate": 6.90983870967742e-05, + "loss": 0.1734, + "step": 19660 + }, + { + "epoch": 0.314576, + "grad_norm": 0.95703125, + "learning_rate": 6.909677419354838e-05, + "loss": 0.1661, + "step": 19661 + }, + { + "epoch": 0.314592, + "grad_norm": 0.98828125, + "learning_rate": 6.909516129032258e-05, + "loss": 0.1759, + "step": 19662 + }, + { + "epoch": 0.314608, + "grad_norm": 0.640625, + "learning_rate": 6.909354838709677e-05, + "loss": 0.197, + "step": 19663 + }, + { + "epoch": 0.314624, + "grad_norm": 0.64453125, + "learning_rate": 6.909193548387097e-05, + "loss": 0.1722, + "step": 19664 + }, + { + "epoch": 0.31464, + "grad_norm": 0.8046875, + "learning_rate": 6.909032258064516e-05, + "loss": 0.1554, + "step": 19665 + }, + { + "epoch": 0.314656, + "grad_norm": 0.734375, + "learning_rate": 6.908870967741936e-05, + "loss": 0.1609, + "step": 19666 + }, + { + "epoch": 0.314672, + "grad_norm": 1.5390625, + "learning_rate": 6.908709677419356e-05, + "loss": 0.2141, + "step": 19667 + }, + { + "epoch": 0.314688, + "grad_norm": 0.7421875, + "learning_rate": 6.908548387096776e-05, + "loss": 0.1707, + "step": 19668 + }, + { + "epoch": 0.314704, + "grad_norm": 0.734375, + "learning_rate": 6.908387096774194e-05, + "loss": 0.1693, + "step": 19669 + }, + { + "epoch": 0.31472, + "grad_norm": 0.6875, + "learning_rate": 6.908225806451614e-05, + "loss": 0.1523, + "step": 19670 + }, + { + "epoch": 0.314736, + "grad_norm": 0.83984375, + "learning_rate": 6.908064516129033e-05, + "loss": 0.2278, + "step": 19671 + }, + { + "epoch": 0.314752, + "grad_norm": 0.8046875, + "learning_rate": 6.907903225806453e-05, + "loss": 0.1641, + "step": 19672 + }, + { + "epoch": 0.314768, + "grad_norm": 0.8828125, + "learning_rate": 6.907741935483871e-05, + "loss": 0.183, + "step": 19673 + }, + { + "epoch": 0.314784, + "grad_norm": 1.203125, + "learning_rate": 6.90758064516129e-05, + "loss": 0.1953, + "step": 19674 + }, + { + "epoch": 0.3148, + "grad_norm": 0.94140625, + "learning_rate": 6.90741935483871e-05, + "loss": 0.1476, + "step": 19675 + }, + { + "epoch": 0.314816, + "grad_norm": 0.5390625, + "learning_rate": 6.907258064516128e-05, + "loss": 0.1437, + "step": 19676 + }, + { + "epoch": 0.314832, + "grad_norm": 0.60546875, + "learning_rate": 6.907096774193548e-05, + "loss": 0.1575, + "step": 19677 + }, + { + "epoch": 0.314848, + "grad_norm": 1.21875, + "learning_rate": 6.906935483870967e-05, + "loss": 0.197, + "step": 19678 + }, + { + "epoch": 0.314864, + "grad_norm": 0.94921875, + "learning_rate": 6.906774193548387e-05, + "loss": 0.1772, + "step": 19679 + }, + { + "epoch": 0.31488, + "grad_norm": 0.8125, + "learning_rate": 6.906612903225807e-05, + "loss": 0.1733, + "step": 19680 + }, + { + "epoch": 0.314896, + "grad_norm": 0.68359375, + "learning_rate": 6.906451612903227e-05, + "loss": 0.1806, + "step": 19681 + }, + { + "epoch": 0.314912, + "grad_norm": 0.8125, + "learning_rate": 6.906290322580646e-05, + "loss": 0.2068, + "step": 19682 + }, + { + "epoch": 0.314928, + "grad_norm": 1.0390625, + "learning_rate": 6.906129032258065e-05, + "loss": 0.1515, + "step": 19683 + }, + { + "epoch": 0.314944, + "grad_norm": 0.9921875, + "learning_rate": 6.905967741935484e-05, + "loss": 0.2088, + "step": 19684 + }, + { + "epoch": 0.31496, + "grad_norm": 0.55078125, + "learning_rate": 6.905806451612904e-05, + "loss": 0.122, + "step": 19685 + }, + { + "epoch": 0.314976, + "grad_norm": 0.7265625, + "learning_rate": 6.905645161290323e-05, + "loss": 0.1862, + "step": 19686 + }, + { + "epoch": 0.314992, + "grad_norm": 1.015625, + "learning_rate": 6.905483870967743e-05, + "loss": 0.1752, + "step": 19687 + }, + { + "epoch": 0.315008, + "grad_norm": 0.92578125, + "learning_rate": 6.905322580645161e-05, + "loss": 0.1541, + "step": 19688 + }, + { + "epoch": 0.315024, + "grad_norm": 0.671875, + "learning_rate": 6.905161290322581e-05, + "loss": 0.1624, + "step": 19689 + }, + { + "epoch": 0.31504, + "grad_norm": 0.6015625, + "learning_rate": 6.905e-05, + "loss": 0.1912, + "step": 19690 + }, + { + "epoch": 0.315056, + "grad_norm": 0.63671875, + "learning_rate": 6.90483870967742e-05, + "loss": 0.2061, + "step": 19691 + }, + { + "epoch": 0.315072, + "grad_norm": 0.83203125, + "learning_rate": 6.90467741935484e-05, + "loss": 0.18, + "step": 19692 + }, + { + "epoch": 0.315088, + "grad_norm": 0.6015625, + "learning_rate": 6.904516129032258e-05, + "loss": 0.1711, + "step": 19693 + }, + { + "epoch": 0.315104, + "grad_norm": 0.87109375, + "learning_rate": 6.904354838709678e-05, + "loss": 0.1651, + "step": 19694 + }, + { + "epoch": 0.31512, + "grad_norm": 0.75390625, + "learning_rate": 6.904193548387097e-05, + "loss": 0.1917, + "step": 19695 + }, + { + "epoch": 0.315136, + "grad_norm": 0.625, + "learning_rate": 6.904032258064517e-05, + "loss": 0.1562, + "step": 19696 + }, + { + "epoch": 0.315152, + "grad_norm": 0.765625, + "learning_rate": 6.903870967741935e-05, + "loss": 0.1437, + "step": 19697 + }, + { + "epoch": 0.315168, + "grad_norm": 0.76953125, + "learning_rate": 6.903709677419355e-05, + "loss": 0.1241, + "step": 19698 + }, + { + "epoch": 0.315184, + "grad_norm": 1.2421875, + "learning_rate": 6.903548387096774e-05, + "loss": 0.1749, + "step": 19699 + }, + { + "epoch": 0.3152, + "grad_norm": 0.9453125, + "learning_rate": 6.903387096774194e-05, + "loss": 0.1396, + "step": 19700 + }, + { + "epoch": 0.315216, + "grad_norm": 0.671875, + "learning_rate": 6.903225806451613e-05, + "loss": 0.1314, + "step": 19701 + }, + { + "epoch": 0.315232, + "grad_norm": 0.6640625, + "learning_rate": 6.903064516129033e-05, + "loss": 0.1478, + "step": 19702 + }, + { + "epoch": 0.315248, + "grad_norm": 1.3046875, + "learning_rate": 6.902903225806453e-05, + "loss": 0.1804, + "step": 19703 + }, + { + "epoch": 0.315264, + "grad_norm": 0.7109375, + "learning_rate": 6.902741935483872e-05, + "loss": 0.2158, + "step": 19704 + }, + { + "epoch": 0.31528, + "grad_norm": 0.68359375, + "learning_rate": 6.902580645161291e-05, + "loss": 0.2004, + "step": 19705 + }, + { + "epoch": 0.315296, + "grad_norm": 0.66015625, + "learning_rate": 6.90241935483871e-05, + "loss": 0.1477, + "step": 19706 + }, + { + "epoch": 0.315312, + "grad_norm": 0.671875, + "learning_rate": 6.90225806451613e-05, + "loss": 0.151, + "step": 19707 + }, + { + "epoch": 0.315328, + "grad_norm": 0.63671875, + "learning_rate": 6.902096774193548e-05, + "loss": 0.1898, + "step": 19708 + }, + { + "epoch": 0.315344, + "grad_norm": 0.89453125, + "learning_rate": 6.901935483870968e-05, + "loss": 0.1429, + "step": 19709 + }, + { + "epoch": 0.31536, + "grad_norm": 0.96484375, + "learning_rate": 6.901774193548387e-05, + "loss": 0.1668, + "step": 19710 + }, + { + "epoch": 0.315376, + "grad_norm": 0.87109375, + "learning_rate": 6.901612903225807e-05, + "loss": 0.1792, + "step": 19711 + }, + { + "epoch": 0.315392, + "grad_norm": 0.69921875, + "learning_rate": 6.901451612903225e-05, + "loss": 0.1592, + "step": 19712 + }, + { + "epoch": 0.315408, + "grad_norm": 0.796875, + "learning_rate": 6.901290322580645e-05, + "loss": 0.1799, + "step": 19713 + }, + { + "epoch": 0.315424, + "grad_norm": 0.96875, + "learning_rate": 6.901129032258064e-05, + "loss": 0.1667, + "step": 19714 + }, + { + "epoch": 0.31544, + "grad_norm": 0.84765625, + "learning_rate": 6.900967741935484e-05, + "loss": 0.1491, + "step": 19715 + }, + { + "epoch": 0.315456, + "grad_norm": 0.71484375, + "learning_rate": 6.900806451612904e-05, + "loss": 0.1612, + "step": 19716 + }, + { + "epoch": 0.315472, + "grad_norm": 0.84375, + "learning_rate": 6.900645161290324e-05, + "loss": 0.1774, + "step": 19717 + }, + { + "epoch": 0.315488, + "grad_norm": 0.921875, + "learning_rate": 6.900483870967742e-05, + "loss": 0.2067, + "step": 19718 + }, + { + "epoch": 0.315504, + "grad_norm": 0.625, + "learning_rate": 6.900322580645162e-05, + "loss": 0.1659, + "step": 19719 + }, + { + "epoch": 0.31552, + "grad_norm": 0.96875, + "learning_rate": 6.900161290322581e-05, + "loss": 0.1554, + "step": 19720 + }, + { + "epoch": 0.315536, + "grad_norm": 0.93359375, + "learning_rate": 6.9e-05, + "loss": 0.2159, + "step": 19721 + }, + { + "epoch": 0.315552, + "grad_norm": 0.6015625, + "learning_rate": 6.89983870967742e-05, + "loss": 0.2013, + "step": 19722 + }, + { + "epoch": 0.315568, + "grad_norm": 1.0703125, + "learning_rate": 6.899677419354838e-05, + "loss": 0.1321, + "step": 19723 + }, + { + "epoch": 0.315584, + "grad_norm": 1.0390625, + "learning_rate": 6.899516129032258e-05, + "loss": 0.1638, + "step": 19724 + }, + { + "epoch": 0.3156, + "grad_norm": 0.765625, + "learning_rate": 6.899354838709677e-05, + "loss": 0.1402, + "step": 19725 + }, + { + "epoch": 0.315616, + "grad_norm": 0.86328125, + "learning_rate": 6.899193548387097e-05, + "loss": 0.156, + "step": 19726 + }, + { + "epoch": 0.315632, + "grad_norm": 0.7890625, + "learning_rate": 6.899032258064517e-05, + "loss": 0.1965, + "step": 19727 + }, + { + "epoch": 0.315648, + "grad_norm": 0.90234375, + "learning_rate": 6.898870967741937e-05, + "loss": 0.2053, + "step": 19728 + }, + { + "epoch": 0.315664, + "grad_norm": 1.1328125, + "learning_rate": 6.898709677419355e-05, + "loss": 0.175, + "step": 19729 + }, + { + "epoch": 0.31568, + "grad_norm": 0.61328125, + "learning_rate": 6.898548387096775e-05, + "loss": 0.1753, + "step": 19730 + }, + { + "epoch": 0.315696, + "grad_norm": 0.5546875, + "learning_rate": 6.898387096774194e-05, + "loss": 0.1495, + "step": 19731 + }, + { + "epoch": 0.315712, + "grad_norm": 1.0078125, + "learning_rate": 6.898225806451614e-05, + "loss": 0.209, + "step": 19732 + }, + { + "epoch": 0.315728, + "grad_norm": 1.03125, + "learning_rate": 6.898064516129032e-05, + "loss": 0.1612, + "step": 19733 + }, + { + "epoch": 0.315744, + "grad_norm": 0.69921875, + "learning_rate": 6.897903225806452e-05, + "loss": 0.1283, + "step": 19734 + }, + { + "epoch": 0.31576, + "grad_norm": 0.86328125, + "learning_rate": 6.897741935483871e-05, + "loss": 0.1644, + "step": 19735 + }, + { + "epoch": 0.315776, + "grad_norm": 0.828125, + "learning_rate": 6.89758064516129e-05, + "loss": 0.1394, + "step": 19736 + }, + { + "epoch": 0.315792, + "grad_norm": 0.58203125, + "learning_rate": 6.89741935483871e-05, + "loss": 0.1251, + "step": 19737 + }, + { + "epoch": 0.315808, + "grad_norm": 0.5625, + "learning_rate": 6.89725806451613e-05, + "loss": 0.1789, + "step": 19738 + }, + { + "epoch": 0.315824, + "grad_norm": 0.72265625, + "learning_rate": 6.897096774193548e-05, + "loss": 0.1641, + "step": 19739 + }, + { + "epoch": 0.31584, + "grad_norm": 0.8828125, + "learning_rate": 6.896935483870968e-05, + "loss": 0.1712, + "step": 19740 + }, + { + "epoch": 0.315856, + "grad_norm": 0.79296875, + "learning_rate": 6.896774193548388e-05, + "loss": 0.1467, + "step": 19741 + }, + { + "epoch": 0.315872, + "grad_norm": 0.61328125, + "learning_rate": 6.896612903225807e-05, + "loss": 0.1232, + "step": 19742 + }, + { + "epoch": 0.315888, + "grad_norm": 0.76953125, + "learning_rate": 6.896451612903227e-05, + "loss": 0.1536, + "step": 19743 + }, + { + "epoch": 0.315904, + "grad_norm": 0.8515625, + "learning_rate": 6.896290322580645e-05, + "loss": 0.157, + "step": 19744 + }, + { + "epoch": 0.31592, + "grad_norm": 0.6796875, + "learning_rate": 6.896129032258065e-05, + "loss": 0.1677, + "step": 19745 + }, + { + "epoch": 0.315936, + "grad_norm": 0.921875, + "learning_rate": 6.895967741935484e-05, + "loss": 0.1687, + "step": 19746 + }, + { + "epoch": 0.315952, + "grad_norm": 0.7578125, + "learning_rate": 6.895806451612904e-05, + "loss": 0.149, + "step": 19747 + }, + { + "epoch": 0.315968, + "grad_norm": 0.69921875, + "learning_rate": 6.895645161290322e-05, + "loss": 0.1175, + "step": 19748 + }, + { + "epoch": 0.315984, + "grad_norm": 0.78515625, + "learning_rate": 6.895483870967742e-05, + "loss": 0.1638, + "step": 19749 + }, + { + "epoch": 0.316, + "grad_norm": 0.9140625, + "learning_rate": 6.895322580645161e-05, + "loss": 0.1325, + "step": 19750 + }, + { + "epoch": 0.316016, + "grad_norm": 0.6796875, + "learning_rate": 6.895161290322581e-05, + "loss": 0.1506, + "step": 19751 + }, + { + "epoch": 0.316032, + "grad_norm": 0.92578125, + "learning_rate": 6.895000000000001e-05, + "loss": 0.2144, + "step": 19752 + }, + { + "epoch": 0.316048, + "grad_norm": 0.71875, + "learning_rate": 6.89483870967742e-05, + "loss": 0.1651, + "step": 19753 + }, + { + "epoch": 0.316064, + "grad_norm": 0.6640625, + "learning_rate": 6.89467741935484e-05, + "loss": 0.1901, + "step": 19754 + }, + { + "epoch": 0.31608, + "grad_norm": 0.91015625, + "learning_rate": 6.894516129032258e-05, + "loss": 0.2015, + "step": 19755 + }, + { + "epoch": 0.316096, + "grad_norm": 0.76171875, + "learning_rate": 6.894354838709678e-05, + "loss": 0.1662, + "step": 19756 + }, + { + "epoch": 0.316112, + "grad_norm": 1.2421875, + "learning_rate": 6.894193548387097e-05, + "loss": 0.1871, + "step": 19757 + }, + { + "epoch": 0.316128, + "grad_norm": 0.72265625, + "learning_rate": 6.894032258064517e-05, + "loss": 0.1469, + "step": 19758 + }, + { + "epoch": 0.316144, + "grad_norm": 0.875, + "learning_rate": 6.893870967741935e-05, + "loss": 0.175, + "step": 19759 + }, + { + "epoch": 0.31616, + "grad_norm": 0.9140625, + "learning_rate": 6.893709677419355e-05, + "loss": 0.1695, + "step": 19760 + }, + { + "epoch": 0.316176, + "grad_norm": 0.6171875, + "learning_rate": 6.893548387096774e-05, + "loss": 0.1937, + "step": 19761 + }, + { + "epoch": 0.316192, + "grad_norm": 0.78515625, + "learning_rate": 6.893387096774194e-05, + "loss": 0.1739, + "step": 19762 + }, + { + "epoch": 0.316208, + "grad_norm": 0.60546875, + "learning_rate": 6.893225806451614e-05, + "loss": 0.154, + "step": 19763 + }, + { + "epoch": 0.316224, + "grad_norm": 0.73828125, + "learning_rate": 6.893064516129034e-05, + "loss": 0.1554, + "step": 19764 + }, + { + "epoch": 0.31624, + "grad_norm": 0.92578125, + "learning_rate": 6.892903225806452e-05, + "loss": 0.1713, + "step": 19765 + }, + { + "epoch": 0.316256, + "grad_norm": 0.64453125, + "learning_rate": 6.892741935483872e-05, + "loss": 0.1649, + "step": 19766 + }, + { + "epoch": 0.316272, + "grad_norm": 0.92578125, + "learning_rate": 6.892580645161291e-05, + "loss": 0.1702, + "step": 19767 + }, + { + "epoch": 0.316288, + "grad_norm": 0.859375, + "learning_rate": 6.89241935483871e-05, + "loss": 0.1492, + "step": 19768 + }, + { + "epoch": 0.316304, + "grad_norm": 0.8359375, + "learning_rate": 6.89225806451613e-05, + "loss": 0.1891, + "step": 19769 + }, + { + "epoch": 0.31632, + "grad_norm": 0.8125, + "learning_rate": 6.892096774193548e-05, + "loss": 0.1762, + "step": 19770 + }, + { + "epoch": 0.316336, + "grad_norm": 0.75390625, + "learning_rate": 6.891935483870968e-05, + "loss": 0.1754, + "step": 19771 + }, + { + "epoch": 0.316352, + "grad_norm": 0.609375, + "learning_rate": 6.891774193548387e-05, + "loss": 0.1773, + "step": 19772 + }, + { + "epoch": 0.316368, + "grad_norm": 1.234375, + "learning_rate": 6.891612903225807e-05, + "loss": 0.166, + "step": 19773 + }, + { + "epoch": 0.316384, + "grad_norm": 0.78515625, + "learning_rate": 6.891451612903225e-05, + "loss": 0.179, + "step": 19774 + }, + { + "epoch": 0.3164, + "grad_norm": 0.78515625, + "learning_rate": 6.891290322580645e-05, + "loss": 0.1639, + "step": 19775 + }, + { + "epoch": 0.316416, + "grad_norm": 0.75, + "learning_rate": 6.891129032258065e-05, + "loss": 0.1568, + "step": 19776 + }, + { + "epoch": 0.316432, + "grad_norm": 0.82421875, + "learning_rate": 6.890967741935485e-05, + "loss": 0.1729, + "step": 19777 + }, + { + "epoch": 0.316448, + "grad_norm": 0.984375, + "learning_rate": 6.890806451612904e-05, + "loss": 0.2051, + "step": 19778 + }, + { + "epoch": 0.316464, + "grad_norm": 1.0546875, + "learning_rate": 6.890645161290324e-05, + "loss": 0.1932, + "step": 19779 + }, + { + "epoch": 0.31648, + "grad_norm": 1.0703125, + "learning_rate": 6.890483870967742e-05, + "loss": 0.1795, + "step": 19780 + }, + { + "epoch": 0.316496, + "grad_norm": 0.7265625, + "learning_rate": 6.890322580645162e-05, + "loss": 0.1778, + "step": 19781 + }, + { + "epoch": 0.316512, + "grad_norm": 0.98828125, + "learning_rate": 6.890161290322581e-05, + "loss": 0.2322, + "step": 19782 + }, + { + "epoch": 0.316528, + "grad_norm": 0.5859375, + "learning_rate": 6.89e-05, + "loss": 0.1477, + "step": 19783 + }, + { + "epoch": 0.316544, + "grad_norm": 1.5, + "learning_rate": 6.88983870967742e-05, + "loss": 0.1934, + "step": 19784 + }, + { + "epoch": 0.31656, + "grad_norm": 0.8359375, + "learning_rate": 6.889677419354838e-05, + "loss": 0.2203, + "step": 19785 + }, + { + "epoch": 0.316576, + "grad_norm": 0.67578125, + "learning_rate": 6.889516129032258e-05, + "loss": 0.1921, + "step": 19786 + }, + { + "epoch": 0.316592, + "grad_norm": 0.69140625, + "learning_rate": 6.889354838709678e-05, + "loss": 0.1915, + "step": 19787 + }, + { + "epoch": 0.316608, + "grad_norm": 0.88671875, + "learning_rate": 6.889193548387098e-05, + "loss": 0.19, + "step": 19788 + }, + { + "epoch": 0.316624, + "grad_norm": 0.68359375, + "learning_rate": 6.889032258064516e-05, + "loss": 0.1919, + "step": 19789 + }, + { + "epoch": 0.31664, + "grad_norm": 0.61328125, + "learning_rate": 6.888870967741936e-05, + "loss": 0.1425, + "step": 19790 + }, + { + "epoch": 0.316656, + "grad_norm": 0.8125, + "learning_rate": 6.888709677419355e-05, + "loss": 0.167, + "step": 19791 + }, + { + "epoch": 0.316672, + "grad_norm": 0.7890625, + "learning_rate": 6.888548387096775e-05, + "loss": 0.1433, + "step": 19792 + }, + { + "epoch": 0.316688, + "grad_norm": 0.90625, + "learning_rate": 6.888387096774194e-05, + "loss": 0.1914, + "step": 19793 + }, + { + "epoch": 0.316704, + "grad_norm": 0.83984375, + "learning_rate": 6.888225806451614e-05, + "loss": 0.1719, + "step": 19794 + }, + { + "epoch": 0.31672, + "grad_norm": 0.75, + "learning_rate": 6.888064516129032e-05, + "loss": 0.151, + "step": 19795 + }, + { + "epoch": 0.316736, + "grad_norm": 0.51171875, + "learning_rate": 6.887903225806452e-05, + "loss": 0.1448, + "step": 19796 + }, + { + "epoch": 0.316752, + "grad_norm": 0.7109375, + "learning_rate": 6.887741935483871e-05, + "loss": 0.2106, + "step": 19797 + }, + { + "epoch": 0.316768, + "grad_norm": 0.640625, + "learning_rate": 6.887580645161291e-05, + "loss": 0.1975, + "step": 19798 + }, + { + "epoch": 0.316784, + "grad_norm": 0.88671875, + "learning_rate": 6.887419354838711e-05, + "loss": 0.1428, + "step": 19799 + }, + { + "epoch": 0.3168, + "grad_norm": 0.609375, + "learning_rate": 6.887258064516129e-05, + "loss": 0.1653, + "step": 19800 + }, + { + "epoch": 0.316816, + "grad_norm": 0.5703125, + "learning_rate": 6.887096774193549e-05, + "loss": 0.1453, + "step": 19801 + }, + { + "epoch": 0.316832, + "grad_norm": 0.71875, + "learning_rate": 6.886935483870968e-05, + "loss": 0.1469, + "step": 19802 + }, + { + "epoch": 0.316848, + "grad_norm": 0.69140625, + "learning_rate": 6.886774193548388e-05, + "loss": 0.1668, + "step": 19803 + }, + { + "epoch": 0.316864, + "grad_norm": 0.70703125, + "learning_rate": 6.886612903225806e-05, + "loss": 0.1787, + "step": 19804 + }, + { + "epoch": 0.31688, + "grad_norm": 0.6875, + "learning_rate": 6.886451612903226e-05, + "loss": 0.1838, + "step": 19805 + }, + { + "epoch": 0.316896, + "grad_norm": 0.51953125, + "learning_rate": 6.886290322580645e-05, + "loss": 0.1613, + "step": 19806 + }, + { + "epoch": 0.316912, + "grad_norm": 0.91015625, + "learning_rate": 6.886129032258065e-05, + "loss": 0.1578, + "step": 19807 + }, + { + "epoch": 0.316928, + "grad_norm": 1.6171875, + "learning_rate": 6.885967741935484e-05, + "loss": 0.2281, + "step": 19808 + }, + { + "epoch": 0.316944, + "grad_norm": 0.8359375, + "learning_rate": 6.885806451612904e-05, + "loss": 0.1446, + "step": 19809 + }, + { + "epoch": 0.31696, + "grad_norm": 1.5703125, + "learning_rate": 6.885645161290322e-05, + "loss": 0.1682, + "step": 19810 + }, + { + "epoch": 0.316976, + "grad_norm": 0.56640625, + "learning_rate": 6.885483870967742e-05, + "loss": 0.1601, + "step": 19811 + }, + { + "epoch": 0.316992, + "grad_norm": 0.703125, + "learning_rate": 6.885322580645162e-05, + "loss": 0.1696, + "step": 19812 + }, + { + "epoch": 0.317008, + "grad_norm": 0.828125, + "learning_rate": 6.885161290322582e-05, + "loss": 0.1687, + "step": 19813 + }, + { + "epoch": 0.317024, + "grad_norm": 1.0390625, + "learning_rate": 6.885e-05, + "loss": 0.1598, + "step": 19814 + }, + { + "epoch": 0.31704, + "grad_norm": 1.2578125, + "learning_rate": 6.884838709677419e-05, + "loss": 0.158, + "step": 19815 + }, + { + "epoch": 0.317056, + "grad_norm": 1.2421875, + "learning_rate": 6.884677419354839e-05, + "loss": 0.1538, + "step": 19816 + }, + { + "epoch": 0.317072, + "grad_norm": 0.80078125, + "learning_rate": 6.884516129032258e-05, + "loss": 0.1724, + "step": 19817 + }, + { + "epoch": 0.317088, + "grad_norm": 1.2265625, + "learning_rate": 6.884354838709678e-05, + "loss": 0.1957, + "step": 19818 + }, + { + "epoch": 0.317104, + "grad_norm": 0.875, + "learning_rate": 6.884193548387096e-05, + "loss": 0.1451, + "step": 19819 + }, + { + "epoch": 0.31712, + "grad_norm": 1.265625, + "learning_rate": 6.884032258064516e-05, + "loss": 0.1462, + "step": 19820 + }, + { + "epoch": 0.317136, + "grad_norm": 1.0859375, + "learning_rate": 6.883870967741935e-05, + "loss": 0.1673, + "step": 19821 + }, + { + "epoch": 0.317152, + "grad_norm": 0.62109375, + "learning_rate": 6.883709677419355e-05, + "loss": 0.1418, + "step": 19822 + }, + { + "epoch": 0.317168, + "grad_norm": 0.8359375, + "learning_rate": 6.883548387096775e-05, + "loss": 0.1384, + "step": 19823 + }, + { + "epoch": 0.317184, + "grad_norm": 1.0234375, + "learning_rate": 6.883387096774195e-05, + "loss": 0.1407, + "step": 19824 + }, + { + "epoch": 0.3172, + "grad_norm": 0.64453125, + "learning_rate": 6.883225806451613e-05, + "loss": 0.1668, + "step": 19825 + }, + { + "epoch": 0.317216, + "grad_norm": 0.77734375, + "learning_rate": 6.883064516129033e-05, + "loss": 0.198, + "step": 19826 + }, + { + "epoch": 0.317232, + "grad_norm": 0.62109375, + "learning_rate": 6.882903225806452e-05, + "loss": 0.1597, + "step": 19827 + }, + { + "epoch": 0.317248, + "grad_norm": 1.0234375, + "learning_rate": 6.882741935483872e-05, + "loss": 0.1556, + "step": 19828 + }, + { + "epoch": 0.317264, + "grad_norm": 0.76171875, + "learning_rate": 6.88258064516129e-05, + "loss": 0.1642, + "step": 19829 + }, + { + "epoch": 0.31728, + "grad_norm": 0.62890625, + "learning_rate": 6.882419354838709e-05, + "loss": 0.1825, + "step": 19830 + }, + { + "epoch": 0.317296, + "grad_norm": 0.734375, + "learning_rate": 6.882258064516129e-05, + "loss": 0.1832, + "step": 19831 + }, + { + "epoch": 0.317312, + "grad_norm": 0.67578125, + "learning_rate": 6.882096774193548e-05, + "loss": 0.1521, + "step": 19832 + }, + { + "epoch": 0.317328, + "grad_norm": 0.8125, + "learning_rate": 6.881935483870968e-05, + "loss": 0.1236, + "step": 19833 + }, + { + "epoch": 0.317344, + "grad_norm": 0.73828125, + "learning_rate": 6.881774193548386e-05, + "loss": 0.1832, + "step": 19834 + }, + { + "epoch": 0.31736, + "grad_norm": 0.7890625, + "learning_rate": 6.881612903225806e-05, + "loss": 0.1813, + "step": 19835 + }, + { + "epoch": 0.317376, + "grad_norm": 0.59765625, + "learning_rate": 6.881451612903226e-05, + "loss": 0.1528, + "step": 19836 + }, + { + "epoch": 0.317392, + "grad_norm": 0.82421875, + "learning_rate": 6.881290322580646e-05, + "loss": 0.1826, + "step": 19837 + }, + { + "epoch": 0.317408, + "grad_norm": 0.85546875, + "learning_rate": 6.881129032258065e-05, + "loss": 0.2377, + "step": 19838 + }, + { + "epoch": 0.317424, + "grad_norm": 0.58203125, + "learning_rate": 6.880967741935485e-05, + "loss": 0.1499, + "step": 19839 + }, + { + "epoch": 0.31744, + "grad_norm": 0.61328125, + "learning_rate": 6.880806451612903e-05, + "loss": 0.1587, + "step": 19840 + }, + { + "epoch": 0.317456, + "grad_norm": 0.65625, + "learning_rate": 6.880645161290323e-05, + "loss": 0.1711, + "step": 19841 + }, + { + "epoch": 0.317472, + "grad_norm": 0.625, + "learning_rate": 6.880483870967742e-05, + "loss": 0.1272, + "step": 19842 + }, + { + "epoch": 0.317488, + "grad_norm": 0.9140625, + "learning_rate": 6.880322580645162e-05, + "loss": 0.1491, + "step": 19843 + }, + { + "epoch": 0.317504, + "grad_norm": 1.171875, + "learning_rate": 6.88016129032258e-05, + "loss": 0.2352, + "step": 19844 + }, + { + "epoch": 0.31752, + "grad_norm": 0.609375, + "learning_rate": 6.879999999999999e-05, + "loss": 0.1545, + "step": 19845 + }, + { + "epoch": 0.317536, + "grad_norm": 0.68359375, + "learning_rate": 6.879838709677419e-05, + "loss": 0.1531, + "step": 19846 + }, + { + "epoch": 0.317552, + "grad_norm": 0.62890625, + "learning_rate": 6.879677419354839e-05, + "loss": 0.1554, + "step": 19847 + }, + { + "epoch": 0.317568, + "grad_norm": 1.4453125, + "learning_rate": 6.879516129032259e-05, + "loss": 0.1872, + "step": 19848 + }, + { + "epoch": 0.317584, + "grad_norm": 0.53515625, + "learning_rate": 6.879354838709678e-05, + "loss": 0.1519, + "step": 19849 + }, + { + "epoch": 0.3176, + "grad_norm": 0.859375, + "learning_rate": 6.879193548387098e-05, + "loss": 0.1649, + "step": 19850 + }, + { + "epoch": 0.317616, + "grad_norm": 1.3515625, + "learning_rate": 6.879032258064516e-05, + "loss": 0.1754, + "step": 19851 + }, + { + "epoch": 0.317632, + "grad_norm": 0.9296875, + "learning_rate": 6.878870967741936e-05, + "loss": 0.1736, + "step": 19852 + }, + { + "epoch": 0.317648, + "grad_norm": 0.61328125, + "learning_rate": 6.878709677419355e-05, + "loss": 0.1718, + "step": 19853 + }, + { + "epoch": 0.317664, + "grad_norm": 0.546875, + "learning_rate": 6.878548387096775e-05, + "loss": 0.136, + "step": 19854 + }, + { + "epoch": 0.31768, + "grad_norm": 0.78125, + "learning_rate": 6.878387096774193e-05, + "loss": 0.1434, + "step": 19855 + }, + { + "epoch": 0.317696, + "grad_norm": 1.0234375, + "learning_rate": 6.878225806451613e-05, + "loss": 0.2009, + "step": 19856 + }, + { + "epoch": 0.317712, + "grad_norm": 0.74609375, + "learning_rate": 6.878064516129032e-05, + "loss": 0.1727, + "step": 19857 + }, + { + "epoch": 0.317728, + "grad_norm": 1.125, + "learning_rate": 6.877903225806452e-05, + "loss": 0.1744, + "step": 19858 + }, + { + "epoch": 0.317744, + "grad_norm": 1.1796875, + "learning_rate": 6.877741935483872e-05, + "loss": 0.158, + "step": 19859 + }, + { + "epoch": 0.31776, + "grad_norm": 0.84765625, + "learning_rate": 6.877580645161292e-05, + "loss": 0.1576, + "step": 19860 + }, + { + "epoch": 0.317776, + "grad_norm": 1.15625, + "learning_rate": 6.87741935483871e-05, + "loss": 0.2222, + "step": 19861 + }, + { + "epoch": 0.317792, + "grad_norm": 0.82421875, + "learning_rate": 6.877258064516129e-05, + "loss": 0.2555, + "step": 19862 + }, + { + "epoch": 0.317808, + "grad_norm": 0.94140625, + "learning_rate": 6.877096774193549e-05, + "loss": 0.2015, + "step": 19863 + }, + { + "epoch": 0.317824, + "grad_norm": 0.69140625, + "learning_rate": 6.876935483870968e-05, + "loss": 0.1269, + "step": 19864 + }, + { + "epoch": 0.31784, + "grad_norm": 0.921875, + "learning_rate": 6.876774193548388e-05, + "loss": 0.1582, + "step": 19865 + }, + { + "epoch": 0.317856, + "grad_norm": 0.9296875, + "learning_rate": 6.876612903225806e-05, + "loss": 0.1606, + "step": 19866 + }, + { + "epoch": 0.317872, + "grad_norm": 0.5546875, + "learning_rate": 6.876451612903226e-05, + "loss": 0.1368, + "step": 19867 + }, + { + "epoch": 0.317888, + "grad_norm": 0.8828125, + "learning_rate": 6.876290322580645e-05, + "loss": 0.2088, + "step": 19868 + }, + { + "epoch": 0.317904, + "grad_norm": 0.734375, + "learning_rate": 6.876129032258065e-05, + "loss": 0.1368, + "step": 19869 + }, + { + "epoch": 0.31792, + "grad_norm": 0.84765625, + "learning_rate": 6.875967741935483e-05, + "loss": 0.1764, + "step": 19870 + }, + { + "epoch": 0.317936, + "grad_norm": 0.57421875, + "learning_rate": 6.875806451612903e-05, + "loss": 0.1838, + "step": 19871 + }, + { + "epoch": 0.317952, + "grad_norm": 0.7109375, + "learning_rate": 6.875645161290323e-05, + "loss": 0.1643, + "step": 19872 + }, + { + "epoch": 0.317968, + "grad_norm": 0.79296875, + "learning_rate": 6.875483870967743e-05, + "loss": 0.1423, + "step": 19873 + }, + { + "epoch": 0.317984, + "grad_norm": 0.99609375, + "learning_rate": 6.875322580645162e-05, + "loss": 0.1556, + "step": 19874 + }, + { + "epoch": 0.318, + "grad_norm": 0.625, + "learning_rate": 6.875161290322582e-05, + "loss": 0.1704, + "step": 19875 + }, + { + "epoch": 0.318016, + "grad_norm": 0.7421875, + "learning_rate": 6.875e-05, + "loss": 0.1675, + "step": 19876 + }, + { + "epoch": 0.318032, + "grad_norm": 0.62890625, + "learning_rate": 6.874838709677419e-05, + "loss": 0.1449, + "step": 19877 + }, + { + "epoch": 0.318048, + "grad_norm": 1.3203125, + "learning_rate": 6.874677419354839e-05, + "loss": 0.1814, + "step": 19878 + }, + { + "epoch": 0.318064, + "grad_norm": 0.734375, + "learning_rate": 6.874516129032258e-05, + "loss": 0.2046, + "step": 19879 + }, + { + "epoch": 0.31808, + "grad_norm": 1.015625, + "learning_rate": 6.874354838709678e-05, + "loss": 0.1702, + "step": 19880 + }, + { + "epoch": 0.318096, + "grad_norm": 0.80859375, + "learning_rate": 6.874193548387096e-05, + "loss": 0.1613, + "step": 19881 + }, + { + "epoch": 0.318112, + "grad_norm": 0.9296875, + "learning_rate": 6.874032258064516e-05, + "loss": 0.2379, + "step": 19882 + }, + { + "epoch": 0.318128, + "grad_norm": 0.6796875, + "learning_rate": 6.873870967741936e-05, + "loss": 0.1544, + "step": 19883 + }, + { + "epoch": 0.318144, + "grad_norm": 1.015625, + "learning_rate": 6.873709677419356e-05, + "loss": 0.1675, + "step": 19884 + }, + { + "epoch": 0.31816, + "grad_norm": 0.6015625, + "learning_rate": 6.873548387096775e-05, + "loss": 0.1366, + "step": 19885 + }, + { + "epoch": 0.318176, + "grad_norm": 0.66015625, + "learning_rate": 6.873387096774195e-05, + "loss": 0.1302, + "step": 19886 + }, + { + "epoch": 0.318192, + "grad_norm": 0.62890625, + "learning_rate": 6.873225806451613e-05, + "loss": 0.1652, + "step": 19887 + }, + { + "epoch": 0.318208, + "grad_norm": 0.7734375, + "learning_rate": 6.873064516129033e-05, + "loss": 0.1896, + "step": 19888 + }, + { + "epoch": 0.318224, + "grad_norm": 0.75390625, + "learning_rate": 6.872903225806452e-05, + "loss": 0.1709, + "step": 19889 + }, + { + "epoch": 0.31824, + "grad_norm": 0.8046875, + "learning_rate": 6.872741935483872e-05, + "loss": 0.2298, + "step": 19890 + }, + { + "epoch": 0.318256, + "grad_norm": 0.67578125, + "learning_rate": 6.87258064516129e-05, + "loss": 0.1795, + "step": 19891 + }, + { + "epoch": 0.318272, + "grad_norm": 0.828125, + "learning_rate": 6.872419354838709e-05, + "loss": 0.1776, + "step": 19892 + }, + { + "epoch": 0.318288, + "grad_norm": 2.28125, + "learning_rate": 6.872258064516129e-05, + "loss": 0.191, + "step": 19893 + }, + { + "epoch": 0.318304, + "grad_norm": 0.625, + "learning_rate": 6.872096774193549e-05, + "loss": 0.1535, + "step": 19894 + }, + { + "epoch": 0.31832, + "grad_norm": 1.140625, + "learning_rate": 6.871935483870969e-05, + "loss": 0.187, + "step": 19895 + }, + { + "epoch": 0.318336, + "grad_norm": 1.1640625, + "learning_rate": 6.871774193548387e-05, + "loss": 0.2129, + "step": 19896 + }, + { + "epoch": 0.318352, + "grad_norm": 0.7734375, + "learning_rate": 6.871612903225807e-05, + "loss": 0.1642, + "step": 19897 + }, + { + "epoch": 0.318368, + "grad_norm": 0.66015625, + "learning_rate": 6.871451612903226e-05, + "loss": 0.1823, + "step": 19898 + }, + { + "epoch": 0.318384, + "grad_norm": 0.87890625, + "learning_rate": 6.871290322580646e-05, + "loss": 0.2071, + "step": 19899 + }, + { + "epoch": 0.3184, + "grad_norm": 0.765625, + "learning_rate": 6.871129032258065e-05, + "loss": 0.1935, + "step": 19900 + }, + { + "epoch": 0.318416, + "grad_norm": 1.1015625, + "learning_rate": 6.870967741935485e-05, + "loss": 0.1812, + "step": 19901 + }, + { + "epoch": 0.318432, + "grad_norm": 0.7421875, + "learning_rate": 6.870806451612903e-05, + "loss": 0.1987, + "step": 19902 + }, + { + "epoch": 0.318448, + "grad_norm": 0.890625, + "learning_rate": 6.870645161290323e-05, + "loss": 0.1637, + "step": 19903 + }, + { + "epoch": 0.318464, + "grad_norm": 1.046875, + "learning_rate": 6.870483870967742e-05, + "loss": 0.1708, + "step": 19904 + }, + { + "epoch": 0.31848, + "grad_norm": 0.89453125, + "learning_rate": 6.870322580645162e-05, + "loss": 0.1584, + "step": 19905 + }, + { + "epoch": 0.318496, + "grad_norm": 0.7109375, + "learning_rate": 6.87016129032258e-05, + "loss": 0.1629, + "step": 19906 + }, + { + "epoch": 0.318512, + "grad_norm": 0.55078125, + "learning_rate": 6.87e-05, + "loss": 0.17, + "step": 19907 + }, + { + "epoch": 0.318528, + "grad_norm": 0.68359375, + "learning_rate": 6.86983870967742e-05, + "loss": 0.1951, + "step": 19908 + }, + { + "epoch": 0.318544, + "grad_norm": 1.109375, + "learning_rate": 6.869677419354839e-05, + "loss": 0.1815, + "step": 19909 + }, + { + "epoch": 0.31856, + "grad_norm": 1.3125, + "learning_rate": 6.869516129032259e-05, + "loss": 0.1421, + "step": 19910 + }, + { + "epoch": 0.318576, + "grad_norm": 1.109375, + "learning_rate": 6.869354838709677e-05, + "loss": 0.2025, + "step": 19911 + }, + { + "epoch": 0.318592, + "grad_norm": 1.609375, + "learning_rate": 6.869193548387097e-05, + "loss": 0.2062, + "step": 19912 + }, + { + "epoch": 0.318608, + "grad_norm": 0.578125, + "learning_rate": 6.869032258064516e-05, + "loss": 0.1177, + "step": 19913 + }, + { + "epoch": 0.318624, + "grad_norm": 0.8828125, + "learning_rate": 6.868870967741936e-05, + "loss": 0.1683, + "step": 19914 + }, + { + "epoch": 0.31864, + "grad_norm": 0.859375, + "learning_rate": 6.868709677419355e-05, + "loss": 0.1764, + "step": 19915 + }, + { + "epoch": 0.318656, + "grad_norm": 0.68359375, + "learning_rate": 6.868548387096775e-05, + "loss": 0.1449, + "step": 19916 + }, + { + "epoch": 0.318672, + "grad_norm": 0.8203125, + "learning_rate": 6.868387096774193e-05, + "loss": 0.1968, + "step": 19917 + }, + { + "epoch": 0.318688, + "grad_norm": 0.82421875, + "learning_rate": 6.868225806451613e-05, + "loss": 0.1712, + "step": 19918 + }, + { + "epoch": 0.318704, + "grad_norm": 0.6484375, + "learning_rate": 6.868064516129033e-05, + "loss": 0.1757, + "step": 19919 + }, + { + "epoch": 0.31872, + "grad_norm": 0.6640625, + "learning_rate": 6.867903225806453e-05, + "loss": 0.19, + "step": 19920 + }, + { + "epoch": 0.318736, + "grad_norm": 0.71484375, + "learning_rate": 6.867741935483872e-05, + "loss": 0.1682, + "step": 19921 + }, + { + "epoch": 0.318752, + "grad_norm": 0.90234375, + "learning_rate": 6.867580645161292e-05, + "loss": 0.1485, + "step": 19922 + }, + { + "epoch": 0.318768, + "grad_norm": 0.953125, + "learning_rate": 6.86741935483871e-05, + "loss": 0.1609, + "step": 19923 + }, + { + "epoch": 0.318784, + "grad_norm": 0.74609375, + "learning_rate": 6.867258064516129e-05, + "loss": 0.2022, + "step": 19924 + }, + { + "epoch": 0.3188, + "grad_norm": 1.3515625, + "learning_rate": 6.867096774193549e-05, + "loss": 0.1826, + "step": 19925 + }, + { + "epoch": 0.318816, + "grad_norm": 0.66796875, + "learning_rate": 6.866935483870967e-05, + "loss": 0.1691, + "step": 19926 + }, + { + "epoch": 0.318832, + "grad_norm": 1.078125, + "learning_rate": 6.866774193548387e-05, + "loss": 0.2292, + "step": 19927 + }, + { + "epoch": 0.318848, + "grad_norm": 0.73828125, + "learning_rate": 6.866612903225806e-05, + "loss": 0.1619, + "step": 19928 + }, + { + "epoch": 0.318864, + "grad_norm": 0.765625, + "learning_rate": 6.866451612903226e-05, + "loss": 0.161, + "step": 19929 + }, + { + "epoch": 0.31888, + "grad_norm": 0.84765625, + "learning_rate": 6.866290322580645e-05, + "loss": 0.1725, + "step": 19930 + }, + { + "epoch": 0.318896, + "grad_norm": 0.6640625, + "learning_rate": 6.866129032258064e-05, + "loss": 0.1625, + "step": 19931 + }, + { + "epoch": 0.318912, + "grad_norm": 0.76953125, + "learning_rate": 6.865967741935484e-05, + "loss": 0.178, + "step": 19932 + }, + { + "epoch": 0.318928, + "grad_norm": 0.7421875, + "learning_rate": 6.865806451612904e-05, + "loss": 0.1986, + "step": 19933 + }, + { + "epoch": 0.318944, + "grad_norm": 0.90234375, + "learning_rate": 6.865645161290323e-05, + "loss": 0.1357, + "step": 19934 + }, + { + "epoch": 0.31896, + "grad_norm": 0.859375, + "learning_rate": 6.865483870967743e-05, + "loss": 0.1591, + "step": 19935 + }, + { + "epoch": 0.318976, + "grad_norm": 0.49609375, + "learning_rate": 6.865322580645162e-05, + "loss": 0.127, + "step": 19936 + }, + { + "epoch": 0.318992, + "grad_norm": 0.88671875, + "learning_rate": 6.865161290322582e-05, + "loss": 0.2072, + "step": 19937 + }, + { + "epoch": 0.319008, + "grad_norm": 0.80859375, + "learning_rate": 6.865e-05, + "loss": 0.1731, + "step": 19938 + }, + { + "epoch": 0.319024, + "grad_norm": 0.80078125, + "learning_rate": 6.864838709677419e-05, + "loss": 0.1305, + "step": 19939 + }, + { + "epoch": 0.31904, + "grad_norm": 0.515625, + "learning_rate": 6.864677419354839e-05, + "loss": 0.1596, + "step": 19940 + }, + { + "epoch": 0.319056, + "grad_norm": 1.1015625, + "learning_rate": 6.864516129032257e-05, + "loss": 0.1934, + "step": 19941 + }, + { + "epoch": 0.319072, + "grad_norm": 0.71484375, + "learning_rate": 6.864354838709677e-05, + "loss": 0.1672, + "step": 19942 + }, + { + "epoch": 0.319088, + "grad_norm": 0.8203125, + "learning_rate": 6.864193548387097e-05, + "loss": 0.1475, + "step": 19943 + }, + { + "epoch": 0.319104, + "grad_norm": 1.171875, + "learning_rate": 6.864032258064517e-05, + "loss": 0.1828, + "step": 19944 + }, + { + "epoch": 0.31912, + "grad_norm": 0.88671875, + "learning_rate": 6.863870967741936e-05, + "loss": 0.1791, + "step": 19945 + }, + { + "epoch": 0.319136, + "grad_norm": 0.6328125, + "learning_rate": 6.863709677419356e-05, + "loss": 0.1668, + "step": 19946 + }, + { + "epoch": 0.319152, + "grad_norm": 0.80078125, + "learning_rate": 6.863548387096774e-05, + "loss": 0.1541, + "step": 19947 + }, + { + "epoch": 0.319168, + "grad_norm": 0.6484375, + "learning_rate": 6.863387096774194e-05, + "loss": 0.181, + "step": 19948 + }, + { + "epoch": 0.319184, + "grad_norm": 0.66015625, + "learning_rate": 6.863225806451613e-05, + "loss": 0.1461, + "step": 19949 + }, + { + "epoch": 0.3192, + "grad_norm": 1.0546875, + "learning_rate": 6.863064516129033e-05, + "loss": 0.1961, + "step": 19950 + }, + { + "epoch": 0.319216, + "grad_norm": 0.62890625, + "learning_rate": 6.862903225806452e-05, + "loss": 0.1346, + "step": 19951 + }, + { + "epoch": 0.319232, + "grad_norm": 0.94140625, + "learning_rate": 6.862741935483872e-05, + "loss": 0.192, + "step": 19952 + }, + { + "epoch": 0.319248, + "grad_norm": 0.6796875, + "learning_rate": 6.86258064516129e-05, + "loss": 0.1708, + "step": 19953 + }, + { + "epoch": 0.319264, + "grad_norm": 0.7265625, + "learning_rate": 6.86241935483871e-05, + "loss": 0.1698, + "step": 19954 + }, + { + "epoch": 0.31928, + "grad_norm": 0.67578125, + "learning_rate": 6.86225806451613e-05, + "loss": 0.1511, + "step": 19955 + }, + { + "epoch": 0.319296, + "grad_norm": 0.69140625, + "learning_rate": 6.862096774193549e-05, + "loss": 0.1707, + "step": 19956 + }, + { + "epoch": 0.319312, + "grad_norm": 0.7421875, + "learning_rate": 6.861935483870969e-05, + "loss": 0.189, + "step": 19957 + }, + { + "epoch": 0.319328, + "grad_norm": 0.66015625, + "learning_rate": 6.861774193548387e-05, + "loss": 0.157, + "step": 19958 + }, + { + "epoch": 0.319344, + "grad_norm": 0.7421875, + "learning_rate": 6.861612903225807e-05, + "loss": 0.1752, + "step": 19959 + }, + { + "epoch": 0.31936, + "grad_norm": 0.47265625, + "learning_rate": 6.861451612903226e-05, + "loss": 0.1601, + "step": 19960 + }, + { + "epoch": 0.319376, + "grad_norm": 0.78125, + "learning_rate": 6.861290322580646e-05, + "loss": 0.1753, + "step": 19961 + }, + { + "epoch": 0.319392, + "grad_norm": 0.81640625, + "learning_rate": 6.861129032258064e-05, + "loss": 0.1541, + "step": 19962 + }, + { + "epoch": 0.319408, + "grad_norm": 1.0234375, + "learning_rate": 6.860967741935484e-05, + "loss": 0.178, + "step": 19963 + }, + { + "epoch": 0.319424, + "grad_norm": 0.7890625, + "learning_rate": 6.860806451612903e-05, + "loss": 0.1893, + "step": 19964 + }, + { + "epoch": 0.31944, + "grad_norm": 0.6796875, + "learning_rate": 6.860645161290323e-05, + "loss": 0.1717, + "step": 19965 + }, + { + "epoch": 0.319456, + "grad_norm": 1.2578125, + "learning_rate": 6.860483870967742e-05, + "loss": 0.1912, + "step": 19966 + }, + { + "epoch": 0.319472, + "grad_norm": 1.734375, + "learning_rate": 6.860322580645161e-05, + "loss": 0.2287, + "step": 19967 + }, + { + "epoch": 0.319488, + "grad_norm": 0.9375, + "learning_rate": 6.860161290322581e-05, + "loss": 0.1857, + "step": 19968 + }, + { + "epoch": 0.319504, + "grad_norm": 0.76171875, + "learning_rate": 6.860000000000001e-05, + "loss": 0.1788, + "step": 19969 + }, + { + "epoch": 0.31952, + "grad_norm": 0.83203125, + "learning_rate": 6.85983870967742e-05, + "loss": 0.1886, + "step": 19970 + }, + { + "epoch": 0.319536, + "grad_norm": 0.8359375, + "learning_rate": 6.859677419354839e-05, + "loss": 0.1574, + "step": 19971 + }, + { + "epoch": 0.319552, + "grad_norm": 1.1171875, + "learning_rate": 6.859516129032259e-05, + "loss": 0.1957, + "step": 19972 + }, + { + "epoch": 0.319568, + "grad_norm": 0.87109375, + "learning_rate": 6.859354838709677e-05, + "loss": 0.1482, + "step": 19973 + }, + { + "epoch": 0.319584, + "grad_norm": 0.88671875, + "learning_rate": 6.859193548387097e-05, + "loss": 0.1699, + "step": 19974 + }, + { + "epoch": 0.3196, + "grad_norm": 0.7890625, + "learning_rate": 6.859032258064516e-05, + "loss": 0.161, + "step": 19975 + }, + { + "epoch": 0.319616, + "grad_norm": 0.90234375, + "learning_rate": 6.858870967741936e-05, + "loss": 0.2062, + "step": 19976 + }, + { + "epoch": 0.319632, + "grad_norm": 0.8125, + "learning_rate": 6.858709677419354e-05, + "loss": 0.1603, + "step": 19977 + }, + { + "epoch": 0.319648, + "grad_norm": 0.84765625, + "learning_rate": 6.858548387096774e-05, + "loss": 0.189, + "step": 19978 + }, + { + "epoch": 0.319664, + "grad_norm": 0.74609375, + "learning_rate": 6.858387096774194e-05, + "loss": 0.1777, + "step": 19979 + }, + { + "epoch": 0.31968, + "grad_norm": 0.76953125, + "learning_rate": 6.858225806451614e-05, + "loss": 0.1754, + "step": 19980 + }, + { + "epoch": 0.319696, + "grad_norm": 0.5546875, + "learning_rate": 6.858064516129033e-05, + "loss": 0.1428, + "step": 19981 + }, + { + "epoch": 0.319712, + "grad_norm": 0.7265625, + "learning_rate": 6.857903225806453e-05, + "loss": 0.1464, + "step": 19982 + }, + { + "epoch": 0.319728, + "grad_norm": 0.97265625, + "learning_rate": 6.857741935483871e-05, + "loss": 0.1831, + "step": 19983 + }, + { + "epoch": 0.319744, + "grad_norm": 0.6640625, + "learning_rate": 6.857580645161291e-05, + "loss": 0.1583, + "step": 19984 + }, + { + "epoch": 0.31976, + "grad_norm": 1.0546875, + "learning_rate": 6.85741935483871e-05, + "loss": 0.1688, + "step": 19985 + }, + { + "epoch": 0.319776, + "grad_norm": 0.87109375, + "learning_rate": 6.857258064516129e-05, + "loss": 0.1823, + "step": 19986 + }, + { + "epoch": 0.319792, + "grad_norm": 0.71875, + "learning_rate": 6.857096774193549e-05, + "loss": 0.1429, + "step": 19987 + }, + { + "epoch": 0.319808, + "grad_norm": 1.53125, + "learning_rate": 6.856935483870967e-05, + "loss": 0.2029, + "step": 19988 + }, + { + "epoch": 0.319824, + "grad_norm": 0.6484375, + "learning_rate": 6.856774193548387e-05, + "loss": 0.1644, + "step": 19989 + }, + { + "epoch": 0.31984, + "grad_norm": 0.447265625, + "learning_rate": 6.856612903225807e-05, + "loss": 0.139, + "step": 19990 + }, + { + "epoch": 0.319856, + "grad_norm": 0.71875, + "learning_rate": 6.856451612903226e-05, + "loss": 0.1537, + "step": 19991 + }, + { + "epoch": 0.319872, + "grad_norm": 0.62890625, + "learning_rate": 6.856290322580646e-05, + "loss": 0.1444, + "step": 19992 + }, + { + "epoch": 0.319888, + "grad_norm": 0.9296875, + "learning_rate": 6.856129032258066e-05, + "loss": 0.1697, + "step": 19993 + }, + { + "epoch": 0.319904, + "grad_norm": 1.125, + "learning_rate": 6.855967741935484e-05, + "loss": 0.1623, + "step": 19994 + }, + { + "epoch": 0.31992, + "grad_norm": 0.6640625, + "learning_rate": 6.855806451612904e-05, + "loss": 0.1563, + "step": 19995 + }, + { + "epoch": 0.319936, + "grad_norm": 0.7421875, + "learning_rate": 6.855645161290323e-05, + "loss": 0.1543, + "step": 19996 + }, + { + "epoch": 0.319952, + "grad_norm": 0.7578125, + "learning_rate": 6.855483870967743e-05, + "loss": 0.1597, + "step": 19997 + }, + { + "epoch": 0.319968, + "grad_norm": 1.328125, + "learning_rate": 6.855322580645161e-05, + "loss": 0.1128, + "step": 19998 + }, + { + "epoch": 0.319984, + "grad_norm": 1.265625, + "learning_rate": 6.855161290322581e-05, + "loss": 0.215, + "step": 19999 + }, + { + "epoch": 0.32, + "grad_norm": 0.60546875, + "learning_rate": 6.855e-05, + "loss": 0.148, + "step": 20000 + } + ], + "logging_steps": 1, + "max_steps": 62500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.492612561421189e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}