{ "runs": [ { "run_name": "pico-decoder-tiny-dolma29k-v2", "log_file": "log_20250829_003838.log", "training_metrics": [ { "step": 0, "loss": 10.9848, "learning_rate": 0.0, "inf_nan_count": 0 }, { "step": 50, "loss": 11.0005, "learning_rate": 1e-06, "inf_nan_count": 0 }, { "step": 100, "loss": 10.9918, "learning_rate": 2e-06, "inf_nan_count": 0 }, { "step": 150, "loss": 10.9776, "learning_rate": 3e-06, "inf_nan_count": 0 }, { "step": 200, "loss": 10.9569, "learning_rate": 4e-06, "inf_nan_count": 0 }, { "step": 250, "loss": 10.9255, "learning_rate": 5e-06, "inf_nan_count": 0 }, { "step": 300, "loss": 10.8883, "learning_rate": 6e-06, "inf_nan_count": 0 }, { "step": 350, "loss": 10.8249, "learning_rate": 7e-06, "inf_nan_count": 0 }, { "step": 400, "loss": 10.7344, "learning_rate": 8e-06, "inf_nan_count": 0 }, { "step": 450, "loss": 10.6177, "learning_rate": 9e-06, "inf_nan_count": 0 }, { "step": 500, "loss": 10.5025, "learning_rate": 1e-05, "inf_nan_count": 0 }, { "step": 550, "loss": 10.3986, "learning_rate": 1.1e-05, "inf_nan_count": 0 }, { "step": 600, "loss": 10.3079, "learning_rate": 1.2e-05, "inf_nan_count": 0 }, { "step": 650, "loss": 10.2142, "learning_rate": 1.3e-05, "inf_nan_count": 0 }, { "step": 700, "loss": 10.1146, "learning_rate": 1.4e-05, "inf_nan_count": 0 }, { "step": 750, "loss": 10.0398, "learning_rate": 1.5e-05, "inf_nan_count": 0 }, { "step": 800, "loss": 9.9311, "learning_rate": 1.6e-05, "inf_nan_count": 0 }, { "step": 850, "loss": 9.8431, "learning_rate": 1.7e-05, "inf_nan_count": 0 }, { "step": 900, "loss": 9.7453, "learning_rate": 1.8e-05, "inf_nan_count": 0 }, { "step": 950, "loss": 9.6527, "learning_rate": 1.9e-05, "inf_nan_count": 0 }, { "step": 1000, "loss": 9.5691, "learning_rate": 2e-05, "inf_nan_count": 0 }, { "step": 1050, "loss": 9.46, "learning_rate": 2.1e-05, "inf_nan_count": 0 }, { "step": 1100, "loss": 9.3525, "learning_rate": 2.2e-05, "inf_nan_count": 0 }, { "step": 1150, "loss": 9.2715, "learning_rate": 2.3e-05, "inf_nan_count": 0 }, { "step": 1200, "loss": 9.1618, "learning_rate": 2.4e-05, "inf_nan_count": 0 }, { "step": 1250, "loss": 9.0547, "learning_rate": 2.5e-05, "inf_nan_count": 0 }, { "step": 1300, "loss": 8.955, "learning_rate": 2.6e-05, "inf_nan_count": 0 }, { "step": 1350, "loss": 8.8251, "learning_rate": 2.7e-05, "inf_nan_count": 0 }, { "step": 1400, "loss": 8.7711, "learning_rate": 2.8e-05, "inf_nan_count": 0 }, { "step": 1450, "loss": 8.6834, "learning_rate": 2.9e-05, "inf_nan_count": 0 }, { "step": 1500, "loss": 8.5638, "learning_rate": 3e-05, "inf_nan_count": 0 }, { "step": 1550, "loss": 8.4572, "learning_rate": 3.1e-05, "inf_nan_count": 0 }, { "step": 1600, "loss": 8.394, "learning_rate": 3.2e-05, "inf_nan_count": 0 }, { "step": 1650, "loss": 8.2973, "learning_rate": 3.3e-05, "inf_nan_count": 0 }, { "step": 1700, "loss": 8.2264, "learning_rate": 3.4e-05, "inf_nan_count": 0 }, { "step": 1750, "loss": 8.1672, "learning_rate": 3.5e-05, "inf_nan_count": 0 }, { "step": 1800, "loss": 8.0695, "learning_rate": 3.6e-05, "inf_nan_count": 0 }, { "step": 1850, "loss": 8.0299, "learning_rate": 3.7e-05, "inf_nan_count": 0 }, { "step": 1900, "loss": 7.9883, "learning_rate": 3.8e-05, "inf_nan_count": 0 }, { "step": 1950, "loss": 7.9429, "learning_rate": 3.9e-05, "inf_nan_count": 0 }, { "step": 2000, "loss": 7.8447, "learning_rate": 4e-05, "inf_nan_count": 0 }, { "step": 2050, "loss": 7.838, "learning_rate": 4.1e-05, "inf_nan_count": 0 }, { "step": 2100, "loss": 7.7671, "learning_rate": 4.2e-05, "inf_nan_count": 0 }, { "step": 2150, "loss": 7.7637, "learning_rate": 4.3e-05, "inf_nan_count": 0 }, { "step": 2200, "loss": 7.706, "learning_rate": 4.4e-05, "inf_nan_count": 0 }, { "step": 2250, "loss": 7.7607, "learning_rate": 4.5e-05, "inf_nan_count": 0 }, { "step": 2300, "loss": 7.7076, "learning_rate": 4.6e-05, "inf_nan_count": 0 }, { "step": 2350, "loss": 7.6787, "learning_rate": 4.7e-05, "inf_nan_count": 0 }, { "step": 2400, "loss": 7.6446, "learning_rate": 4.8e-05, "inf_nan_count": 0 }, { "step": 2450, "loss": 7.5999, "learning_rate": 4.9e-05, "inf_nan_count": 0 }, { "step": 2500, "loss": 7.6154, "learning_rate": 5e-05, "inf_nan_count": 0 }, { "step": 2550, "loss": 7.5627, "learning_rate": 5.1e-05, "inf_nan_count": 0 }, { "step": 2600, "loss": 7.5747, "learning_rate": 5.2e-05, "inf_nan_count": 0 }, { "step": 2650, "loss": 7.5358, "learning_rate": 5.3e-05, "inf_nan_count": 0 }, { "step": 2700, "loss": 7.5148, "learning_rate": 5.4e-05, "inf_nan_count": 0 }, { "step": 2750, "loss": 7.4874, "learning_rate": 5.5e-05, "inf_nan_count": 0 }, { "step": 2800, "loss": 7.4438, "learning_rate": 5.6e-05, "inf_nan_count": 0 }, { "step": 2850, "loss": 7.4772, "learning_rate": 5.7e-05, "inf_nan_count": 0 }, { "step": 2900, "loss": 7.4135, "learning_rate": 5.8e-05, "inf_nan_count": 0 }, { "step": 2950, "loss": 7.3929, "learning_rate": 5.9e-05, "inf_nan_count": 0 }, { "step": 3000, "loss": 7.3566, "learning_rate": 6e-05, "inf_nan_count": 0 }, { "step": 3050, "loss": 7.3318, "learning_rate": 6.1e-05, "inf_nan_count": 0 }, { "step": 3100, "loss": 7.3114, "learning_rate": 6.2e-05, "inf_nan_count": 0 }, { "step": 3150, "loss": 7.2734, "learning_rate": 6.3e-05, "inf_nan_count": 0 }, { "step": 3200, "loss": 7.322, "learning_rate": 6.4e-05, "inf_nan_count": 0 }, { "step": 3250, "loss": 7.2621, "learning_rate": 6.5e-05, "inf_nan_count": 0 }, { "step": 3300, "loss": 7.2257, "learning_rate": 6.6e-05, "inf_nan_count": 0 }, { "step": 3350, "loss": 7.2447, "learning_rate": 6.7e-05, "inf_nan_count": 0 }, { "step": 3400, "loss": 7.2344, "learning_rate": 6.8e-05, "inf_nan_count": 0 }, { "step": 3450, "loss": 7.1488, "learning_rate": 6.9e-05, "inf_nan_count": 0 }, { "step": 3500, "loss": 7.1797, "learning_rate": 7e-05, "inf_nan_count": 0 }, { "step": 3550, "loss": 7.1737, "learning_rate": 7.1e-05, "inf_nan_count": 0 }, { "step": 3600, "loss": 7.1204, "learning_rate": 7.2e-05, "inf_nan_count": 0 }, { "step": 3650, "loss": 7.1102, "learning_rate": 7.3e-05, "inf_nan_count": 0 }, { "step": 3700, "loss": 7.0845, "learning_rate": 7.4e-05, "inf_nan_count": 0 }, { "step": 3750, "loss": 7.0858, "learning_rate": 7.5e-05, "inf_nan_count": 0 }, { "step": 3800, "loss": 7.0362, "learning_rate": 7.6e-05, "inf_nan_count": 0 }, { "step": 3850, "loss": 7.0603, "learning_rate": 7.7e-05, "inf_nan_count": 0 }, { "step": 3900, "loss": 7.0172, "learning_rate": 7.8e-05, "inf_nan_count": 0 }, { "step": 3950, "loss": 6.9948, "learning_rate": 7.9e-05, "inf_nan_count": 0 }, { "step": 4000, "loss": 6.9909, "learning_rate": 8e-05, "inf_nan_count": 0 }, { "step": 4050, "loss": 6.9477, "learning_rate": 8.1e-05, "inf_nan_count": 0 }, { "step": 4100, "loss": 6.9651, "learning_rate": 8.2e-05, "inf_nan_count": 0 }, { "step": 4150, "loss": 6.9149, "learning_rate": 8.3e-05, "inf_nan_count": 0 }, { "step": 4200, "loss": 6.893, "learning_rate": 8.4e-05, "inf_nan_count": 0 }, { "step": 4250, "loss": 6.9227, "learning_rate": 8.5e-05, "inf_nan_count": 0 }, { "step": 4300, "loss": 6.879, "learning_rate": 8.6e-05, "inf_nan_count": 0 }, { "step": 4350, "loss": 6.8649, "learning_rate": 8.7e-05, "inf_nan_count": 0 }, { "step": 4400, "loss": 6.8305, "learning_rate": 8.8e-05, "inf_nan_count": 0 }, { "step": 4450, "loss": 6.8085, "learning_rate": 8.9e-05, "inf_nan_count": 0 }, { "step": 4500, "loss": 6.8315, "learning_rate": 9e-05, "inf_nan_count": 0 }, { "step": 4550, "loss": 6.7885, "learning_rate": 9.1e-05, "inf_nan_count": 0 }, { "step": 4600, "loss": 6.7805, "learning_rate": 9.2e-05, "inf_nan_count": 0 }, { "step": 4650, "loss": 6.7737, "learning_rate": 9.3e-05, "inf_nan_count": 0 }, { "step": 4700, "loss": 6.7649, "learning_rate": 9.4e-05, "inf_nan_count": 0 }, { "step": 4750, "loss": 6.7562, "learning_rate": 9.5e-05, "inf_nan_count": 0 }, { "step": 4800, "loss": 6.7347, "learning_rate": 9.6e-05, "inf_nan_count": 0 }, { "step": 4850, "loss": 6.7161, "learning_rate": 9.7e-05, "inf_nan_count": 0 }, { "step": 4900, "loss": 6.6889, "learning_rate": 9.8e-05, "inf_nan_count": 0 }, { "step": 4950, "loss": 6.7299, "learning_rate": 9.9e-05, "inf_nan_count": 0 }, { "step": 5000, "loss": 6.6605, "learning_rate": 0.0001, "inf_nan_count": 0 }, { "step": 5050, "loss": 6.6552, "learning_rate": 0.0001, "inf_nan_count": 0 }, { "step": 5100, "loss": 6.7038, "learning_rate": 9.99e-05, "inf_nan_count": 0 }, { "step": 5150, "loss": 6.6452, "learning_rate": 9.99e-05, "inf_nan_count": 0 }, { "step": 5200, "loss": 6.6522, "learning_rate": 9.99e-05, "inf_nan_count": 0 }, { "step": 5250, "loss": 6.627, "learning_rate": 9.99e-05, "inf_nan_count": 0 }, { "step": 5300, "loss": 6.5733, "learning_rate": 9.98e-05, "inf_nan_count": 0 }, { "step": 5350, "loss": 6.5833, "learning_rate": 9.98e-05, "inf_nan_count": 0 }, { "step": 5400, "loss": 6.5854, "learning_rate": 9.98e-05, "inf_nan_count": 0 }, { "step": 5450, "loss": 6.6012, "learning_rate": 9.98e-05, "inf_nan_count": 0 }, { "step": 5500, "loss": 6.5786, "learning_rate": 9.97e-05, "inf_nan_count": 0 } ], "evaluation_results": [ { "step": 1000, "paloma": 5.073320568651489e+18 }, { "step": 2000, "paloma": 1.8978577072995303e+19 }, { "step": 3000, "paloma": 3.1701596694317715e+19 }, { "step": 4000, "paloma": 2.5015965971757485e+20 }, { "step": 5000, "paloma": 2.38712860824014e+21 } ], "config": { "d_model": 96, "n_layers": 12, "max_seq_len": 2048, "vocab_size": 50304, "lr": 0.0001, "max_steps": 200000, "batch_size": 1 } }, { "run_name": "pico-decoder-tiny-dolma29k-v3", "log_file": "log_20250829_020629.log", "training_metrics": [ { "step": 500, "loss": 10.8854, "learning_rate": 3.13e-06, "inf_nan_count": 0 }, { "step": 525, "loss": 10.889, "learning_rate": 3.28e-06, "inf_nan_count": 0 }, { "step": 550, "loss": 10.8846, "learning_rate": 3.44e-06, "inf_nan_count": 0 }, { "step": 575, "loss": 10.8657, "learning_rate": 3.59e-06, "inf_nan_count": 0 }, { "step": 600, "loss": 10.859, "learning_rate": 3.75e-06, "inf_nan_count": 0 }, { "step": 625, "loss": 10.8328, "learning_rate": 3.91e-06, "inf_nan_count": 0 }, { "step": 650, "loss": 10.8166, "learning_rate": 4.06e-06, "inf_nan_count": 0 }, { "step": 675, "loss": 10.7913, "learning_rate": 4.22e-06, "inf_nan_count": 0 }, { "step": 700, "loss": 10.7609, "learning_rate": 4.37e-06, "inf_nan_count": 0 }, { "step": 725, "loss": 10.7322, "learning_rate": 4.53e-06, "inf_nan_count": 0 }, { "step": 750, "loss": 10.7121, "learning_rate": 4.69e-06, "inf_nan_count": 0 }, { "step": 775, "loss": 10.6877, "learning_rate": 4.84e-06, "inf_nan_count": 0 }, { "step": 800, "loss": 10.6436, "learning_rate": 5e-06, "inf_nan_count": 0 }, { "step": 825, "loss": 10.6256, "learning_rate": 5.16e-06, "inf_nan_count": 0 }, { "step": 850, "loss": 10.5961, "learning_rate": 5.31e-06, "inf_nan_count": 0 }, { "step": 875, "loss": 10.5443, "learning_rate": 5.47e-06, "inf_nan_count": 0 }, { "step": 900, "loss": 10.5197, "learning_rate": 5.63e-06, "inf_nan_count": 0 }, { "step": 925, "loss": 10.4854, "learning_rate": 5.78e-06, "inf_nan_count": 0 }, { "step": 950, "loss": 10.4826, "learning_rate": 5.94e-06, "inf_nan_count": 0 }, { "step": 975, "loss": 10.4557, "learning_rate": 6.09e-06, "inf_nan_count": 0 }, { "step": 1000, "loss": 10.4142, "learning_rate": 6.25e-06, "inf_nan_count": 0 }, { "step": 1025, "loss": 10.3885, "learning_rate": 6.41e-06, "inf_nan_count": 0 }, { "step": 1050, "loss": 10.3737, "learning_rate": 6.56e-06, "inf_nan_count": 0 }, { "step": 1075, "loss": 10.3534, "learning_rate": 6.72e-06, "inf_nan_count": 0 }, { "step": 1100, "loss": 10.3219, "learning_rate": 6.88e-06, "inf_nan_count": 0 }, { "step": 1125, "loss": 10.3064, "learning_rate": 7.03e-06, "inf_nan_count": 0 }, { "step": 1150, "loss": 10.2761, "learning_rate": 7.19e-06, "inf_nan_count": 0 }, { "step": 1175, "loss": 10.2592, "learning_rate": 7.34e-06, "inf_nan_count": 0 }, { "step": 1200, "loss": 10.242, "learning_rate": 7.5e-06, "inf_nan_count": 0 }, { "step": 1225, "loss": 10.2141, "learning_rate": 7.66e-06, "inf_nan_count": 0 }, { "step": 1250, "loss": 10.1882, "learning_rate": 7.81e-06, "inf_nan_count": 0 }, { "step": 1275, "loss": 10.1608, "learning_rate": 7.97e-06, "inf_nan_count": 0 }, { "step": 1300, "loss": 10.146, "learning_rate": 8.13e-06, "inf_nan_count": 0 }, { "step": 1325, "loss": 10.0944, "learning_rate": 8.28e-06, "inf_nan_count": 0 }, { "step": 1350, "loss": 10.0885, "learning_rate": 8.44e-06, "inf_nan_count": 0 }, { "step": 1375, "loss": 10.0748, "learning_rate": 8.59e-06, "inf_nan_count": 0 }, { "step": 1400, "loss": 10.0425, "learning_rate": 8.75e-06, "inf_nan_count": 0 }, { "step": 1425, "loss": 10.0422, "learning_rate": 8.91e-06, "inf_nan_count": 0 }, { "step": 1450, "loss": 10.0039, "learning_rate": 9.06e-06, "inf_nan_count": 0 }, { "step": 1475, "loss": 9.9736, "learning_rate": 9.22e-06, "inf_nan_count": 0 }, { "step": 1500, "loss": 9.9729, "learning_rate": 9.38e-06, "inf_nan_count": 0 }, { "step": 1525, "loss": 9.9379, "learning_rate": 9.53e-06, "inf_nan_count": 0 }, { "step": 1550, "loss": 9.8819, "learning_rate": 9.69e-06, "inf_nan_count": 0 }, { "step": 1575, "loss": 9.8702, "learning_rate": 9.84e-06, "inf_nan_count": 0 }, { "step": 1600, "loss": 9.8571, "learning_rate": 1e-05, "inf_nan_count": 0 }, { "step": 1625, "loss": 9.8356, "learning_rate": 1.02e-05, "inf_nan_count": 0 }, { "step": 1650, "loss": 9.7973, "learning_rate": 1.03e-05, "inf_nan_count": 0 }, { "step": 1675, "loss": 9.7745, "learning_rate": 1.05e-05, "inf_nan_count": 0 }, { "step": 1700, "loss": 9.7673, "learning_rate": 1.06e-05, "inf_nan_count": 0 }, { "step": 1725, "loss": 9.7406, "learning_rate": 1.08e-05, "inf_nan_count": 0 }, { "step": 1750, "loss": 9.7312, "learning_rate": 1.09e-05, "inf_nan_count": 0 }, { "step": 1775, "loss": 9.6563, "learning_rate": 1.11e-05, "inf_nan_count": 0 }, { "step": 1800, "loss": 9.6515, "learning_rate": 1.13e-05, "inf_nan_count": 0 }, { "step": 1825, "loss": 9.6241, "learning_rate": 1.14e-05, "inf_nan_count": 0 }, { "step": 1850, "loss": 9.6015, "learning_rate": 1.16e-05, "inf_nan_count": 0 }, { "step": 1875, "loss": 9.5933, "learning_rate": 1.17e-05, "inf_nan_count": 0 }, { "step": 1900, "loss": 9.5544, "learning_rate": 1.19e-05, "inf_nan_count": 0 }, { "step": 1925, "loss": 9.5407, "learning_rate": 1.2e-05, "inf_nan_count": 0 }, { "step": 1950, "loss": 9.5431, "learning_rate": 1.22e-05, "inf_nan_count": 0 }, { "step": 1975, "loss": 9.4853, "learning_rate": 1.23e-05, "inf_nan_count": 0 }, { "step": 2000, "loss": 9.4665, "learning_rate": 1.25e-05, "inf_nan_count": 0 }, { "step": 2025, "loss": 9.4621, "learning_rate": 1.27e-05, "inf_nan_count": 0 }, { "step": 2050, "loss": 9.4031, "learning_rate": 1.28e-05, "inf_nan_count": 0 }, { "step": 2075, "loss": 9.3699, "learning_rate": 1.3e-05, "inf_nan_count": 0 }, { "step": 2100, "loss": 9.3422, "learning_rate": 1.31e-05, "inf_nan_count": 0 }, { "step": 2125, "loss": 9.3129, "learning_rate": 1.33e-05, "inf_nan_count": 0 }, { "step": 2150, "loss": 9.2917, "learning_rate": 1.34e-05, "inf_nan_count": 0 }, { "step": 2175, "loss": 9.267, "learning_rate": 1.36e-05, "inf_nan_count": 0 }, { "step": 2200, "loss": 9.2512, "learning_rate": 1.38e-05, "inf_nan_count": 0 }, { "step": 2225, "loss": 9.2737, "learning_rate": 1.39e-05, "inf_nan_count": 0 }, { "step": 2250, "loss": 9.2357, "learning_rate": 1.41e-05, "inf_nan_count": 0 }, { "step": 2275, "loss": 9.1471, "learning_rate": 1.42e-05, "inf_nan_count": 0 }, { "step": 2300, "loss": 9.1305, "learning_rate": 1.44e-05, "inf_nan_count": 0 }, { "step": 2325, "loss": 9.143, "learning_rate": 1.45e-05, "inf_nan_count": 0 }, { "step": 2350, "loss": 9.0948, "learning_rate": 1.47e-05, "inf_nan_count": 0 }, { "step": 2375, "loss": 9.0256, "learning_rate": 1.48e-05, "inf_nan_count": 0 }, { "step": 2400, "loss": 9.0664, "learning_rate": 1.5e-05, "inf_nan_count": 0 }, { "step": 2425, "loss": 9.002, "learning_rate": 1.52e-05, "inf_nan_count": 0 }, { "step": 2450, "loss": 8.9518, "learning_rate": 1.53e-05, "inf_nan_count": 0 }, { "step": 2475, "loss": 8.9717, "learning_rate": 1.55e-05, "inf_nan_count": 0 }, { "step": 2500, "loss": 8.9536, "learning_rate": 1.56e-05, "inf_nan_count": 0 }, { "step": 2525, "loss": 8.8812, "learning_rate": 1.58e-05, "inf_nan_count": 0 }, { "step": 2550, "loss": 8.8824, "learning_rate": 1.59e-05, "inf_nan_count": 0 }, { "step": 2575, "loss": 8.8564, "learning_rate": 1.61e-05, "inf_nan_count": 0 }, { "step": 2600, "loss": 8.8419, "learning_rate": 1.63e-05, "inf_nan_count": 0 }, { "step": 2625, "loss": 8.7865, "learning_rate": 1.64e-05, "inf_nan_count": 0 }, { "step": 2650, "loss": 8.7493, "learning_rate": 1.66e-05, "inf_nan_count": 0 }, { "step": 2675, "loss": 8.7255, "learning_rate": 1.67e-05, "inf_nan_count": 0 }, { "step": 2700, "loss": 8.6469, "learning_rate": 1.69e-05, "inf_nan_count": 0 }, { "step": 2725, "loss": 8.6799, "learning_rate": 1.7e-05, "inf_nan_count": 0 }, { "step": 2750, "loss": 8.6974, "learning_rate": 1.72e-05, "inf_nan_count": 0 }, { "step": 2775, "loss": 8.6441, "learning_rate": 1.73e-05, "inf_nan_count": 0 }, { "step": 2800, "loss": 8.6689, "learning_rate": 1.75e-05, "inf_nan_count": 0 }, { "step": 2825, "loss": 8.5732, "learning_rate": 1.77e-05, "inf_nan_count": 0 }, { "step": 2850, "loss": 8.5955, "learning_rate": 1.78e-05, "inf_nan_count": 0 }, { "step": 2875, "loss": 8.5823, "learning_rate": 1.8e-05, "inf_nan_count": 0 }, { "step": 2900, "loss": 8.5968, "learning_rate": 1.81e-05, "inf_nan_count": 0 }, { "step": 2925, "loss": 8.4721, "learning_rate": 1.83e-05, "inf_nan_count": 0 }, { "step": 2950, "loss": 8.4672, "learning_rate": 1.84e-05, "inf_nan_count": 0 }, { "step": 2975, "loss": 8.4033, "learning_rate": 1.86e-05, "inf_nan_count": 0 }, { "step": 3000, "loss": 8.4947, "learning_rate": 1.88e-05, "inf_nan_count": 0 }, { "step": 3025, "loss": 8.378, "learning_rate": 1.89e-05, "inf_nan_count": 0 }, { "step": 3050, "loss": 8.3581, "learning_rate": 1.91e-05, "inf_nan_count": 0 }, { "step": 3075, "loss": 8.3341, "learning_rate": 1.92e-05, "inf_nan_count": 0 }, { "step": 3100, "loss": 8.3391, "learning_rate": 1.94e-05, "inf_nan_count": 0 }, { "step": 3125, "loss": 8.367, "learning_rate": 1.95e-05, "inf_nan_count": 0 }, { "step": 3150, "loss": 8.237, "learning_rate": 1.97e-05, "inf_nan_count": 0 }, { "step": 3175, "loss": 8.2879, "learning_rate": 1.98e-05, "inf_nan_count": 0 }, { "step": 3200, "loss": 8.2706, "learning_rate": 2e-05, "inf_nan_count": 0 }, { "step": 3225, "loss": 8.1983, "learning_rate": 2.02e-05, "inf_nan_count": 0 }, { "step": 3250, "loss": 8.2174, "learning_rate": 2.03e-05, "inf_nan_count": 0 }, { "step": 3275, "loss": 8.2229, "learning_rate": 2.05e-05, "inf_nan_count": 0 }, { "step": 3300, "loss": 8.1398, "learning_rate": 2.06e-05, "inf_nan_count": 0 }, { "step": 3325, "loss": 8.143, "learning_rate": 2.08e-05, "inf_nan_count": 0 }, { "step": 3350, "loss": 8.1471, "learning_rate": 2.09e-05, "inf_nan_count": 0 }, { "step": 3375, "loss": 8.0908, "learning_rate": 2.11e-05, "inf_nan_count": 0 }, { "step": 3400, "loss": 8.1165, "learning_rate": 2.13e-05, "inf_nan_count": 0 }, { "step": 3425, "loss": 8.0957, "learning_rate": 2.14e-05, "inf_nan_count": 0 }, { "step": 3450, "loss": 8.1115, "learning_rate": 2.16e-05, "inf_nan_count": 0 }, { "step": 3475, "loss": 8.0623, "learning_rate": 2.17e-05, "inf_nan_count": 0 }, { "step": 3500, "loss": 8.0527, "learning_rate": 2.19e-05, "inf_nan_count": 0 }, { "step": 3525, "loss": 7.9975, "learning_rate": 2.2e-05, "inf_nan_count": 0 }, { "step": 3550, "loss": 7.9881, "learning_rate": 2.22e-05, "inf_nan_count": 0 }, { "step": 3575, "loss": 8.006, "learning_rate": 2.23e-05, "inf_nan_count": 0 }, { "step": 3600, "loss": 7.9366, "learning_rate": 2.25e-05, "inf_nan_count": 0 }, { "step": 3625, "loss": 8.0252, "learning_rate": 2.27e-05, "inf_nan_count": 0 }, { "step": 3650, "loss": 7.916, "learning_rate": 2.28e-05, "inf_nan_count": 0 }, { "step": 3675, "loss": 7.947, "learning_rate": 2.3e-05, "inf_nan_count": 0 }, { "step": 3700, "loss": 7.8943, "learning_rate": 2.31e-05, "inf_nan_count": 0 }, { "step": 3725, "loss": 7.8951, "learning_rate": 2.33e-05, "inf_nan_count": 0 }, { "step": 3750, "loss": 7.9316, "learning_rate": 2.34e-05, "inf_nan_count": 0 }, { "step": 3775, "loss": 7.9407, "learning_rate": 2.36e-05, "inf_nan_count": 0 }, { "step": 3800, "loss": 7.9385, "learning_rate": 2.38e-05, "inf_nan_count": 0 }, { "step": 3825, "loss": 7.88, "learning_rate": 2.39e-05, "inf_nan_count": 0 }, { "step": 3850, "loss": 7.9207, "learning_rate": 2.41e-05, "inf_nan_count": 0 }, { "step": 3875, "loss": 7.8258, "learning_rate": 2.42e-05, "inf_nan_count": 0 }, { "step": 3900, "loss": 7.9005, "learning_rate": 2.44e-05, "inf_nan_count": 0 }, { "step": 3925, "loss": 7.8232, "learning_rate": 2.45e-05, "inf_nan_count": 0 }, { "step": 3950, "loss": 7.7847, "learning_rate": 2.47e-05, "inf_nan_count": 0 }, { "step": 3975, "loss": 7.7909, "learning_rate": 2.48e-05, "inf_nan_count": 0 }, { "step": 4000, "loss": 7.7419, "learning_rate": 2.5e-05, "inf_nan_count": 0 }, { "step": 4025, "loss": 7.8031, "learning_rate": 2.52e-05, "inf_nan_count": 0 }, { "step": 4050, "loss": 7.7948, "learning_rate": 2.53e-05, "inf_nan_count": 0 }, { "step": 4075, "loss": 7.7259, "learning_rate": 2.55e-05, "inf_nan_count": 0 }, { "step": 4100, "loss": 7.8406, "learning_rate": 2.56e-05, "inf_nan_count": 0 }, { "step": 4125, "loss": 7.7938, "learning_rate": 2.58e-05, "inf_nan_count": 0 }, { "step": 4150, "loss": 7.7101, "learning_rate": 2.59e-05, "inf_nan_count": 0 }, { "step": 4175, "loss": 7.6633, "learning_rate": 2.61e-05, "inf_nan_count": 0 }, { "step": 4200, "loss": 7.683, "learning_rate": 2.63e-05, "inf_nan_count": 0 }, { "step": 4225, "loss": 7.7106, "learning_rate": 2.64e-05, "inf_nan_count": 0 }, { "step": 4250, "loss": 7.7174, "learning_rate": 2.66e-05, "inf_nan_count": 0 }, { "step": 4275, "loss": 7.7508, "learning_rate": 2.67e-05, "inf_nan_count": 0 }, { "step": 4300, "loss": 7.6831, "learning_rate": 2.69e-05, "inf_nan_count": 0 }, { "step": 4325, "loss": 7.6498, "learning_rate": 2.7e-05, "inf_nan_count": 0 }, { "step": 4350, "loss": 7.6668, "learning_rate": 2.72e-05, "inf_nan_count": 0 }, { "step": 4375, "loss": 7.6852, "learning_rate": 2.73e-05, "inf_nan_count": 0 }, { "step": 4400, "loss": 7.6469, "learning_rate": 2.75e-05, "inf_nan_count": 0 }, { "step": 4425, "loss": 7.7448, "learning_rate": 2.77e-05, "inf_nan_count": 0 }, { "step": 4450, "loss": 7.7422, "learning_rate": 2.78e-05, "inf_nan_count": 0 }, { "step": 4475, "loss": 7.6918, "learning_rate": 2.8e-05, "inf_nan_count": 0 }, { "step": 4500, "loss": 7.7084, "learning_rate": 2.81e-05, "inf_nan_count": 0 }, { "step": 4525, "loss": 7.722, "learning_rate": 2.83e-05, "inf_nan_count": 0 }, { "step": 4550, "loss": 7.6893, "learning_rate": 2.84e-05, "inf_nan_count": 0 }, { "step": 4575, "loss": 7.6454, "learning_rate": 2.86e-05, "inf_nan_count": 0 }, { "step": 4600, "loss": 7.6298, "learning_rate": 2.87e-05, "inf_nan_count": 0 }, { "step": 4625, "loss": 7.642, "learning_rate": 2.89e-05, "inf_nan_count": 0 }, { "step": 4650, "loss": 7.6247, "learning_rate": 2.91e-05, "inf_nan_count": 0 }, { "step": 4675, "loss": 7.6448, "learning_rate": 2.92e-05, "inf_nan_count": 0 }, { "step": 4700, "loss": 7.6506, "learning_rate": 2.94e-05, "inf_nan_count": 0 }, { "step": 4725, "loss": 7.6356, "learning_rate": 2.95e-05, "inf_nan_count": 0 }, { "step": 4750, "loss": 7.6426, "learning_rate": 2.97e-05, "inf_nan_count": 0 }, { "step": 4775, "loss": 7.6388, "learning_rate": 2.98e-05, "inf_nan_count": 0 }, { "step": 4800, "loss": 7.5216, "learning_rate": 3e-05, "inf_nan_count": 0 }, { "step": 4825, "loss": 7.5367, "learning_rate": 3.02e-05, "inf_nan_count": 0 }, { "step": 4850, "loss": 7.5084, "learning_rate": 3.03e-05, "inf_nan_count": 0 }, { "step": 4875, "loss": 7.6092, "learning_rate": 3.05e-05, "inf_nan_count": 0 }, { "step": 4900, "loss": 7.576, "learning_rate": 3.06e-05, "inf_nan_count": 0 }, { "step": 4925, "loss": 7.5686, "learning_rate": 3.08e-05, "inf_nan_count": 0 }, { "step": 4950, "loss": 7.5583, "learning_rate": 3.09e-05, "inf_nan_count": 0 }, { "step": 4975, "loss": 7.5818, "learning_rate": 3.11e-05, "inf_nan_count": 0 }, { "step": 5000, "loss": 7.6004, "learning_rate": 3.13e-05, "inf_nan_count": 0 }, { "step": 5025, "loss": 7.5371, "learning_rate": 3.14e-05, "inf_nan_count": 0 }, { "step": 5050, "loss": 7.5179, "learning_rate": 3.16e-05, "inf_nan_count": 0 }, { "step": 5075, "loss": 7.5255, "learning_rate": 3.17e-05, "inf_nan_count": 0 }, { "step": 5100, "loss": 7.5155, "learning_rate": 3.19e-05, "inf_nan_count": 0 }, { "step": 5125, "loss": 7.566, "learning_rate": 3.2e-05, "inf_nan_count": 0 }, { "step": 5150, "loss": 7.4797, "learning_rate": 3.22e-05, "inf_nan_count": 0 }, { "step": 5175, "loss": 7.6224, "learning_rate": 3.23e-05, "inf_nan_count": 0 }, { "step": 5200, "loss": 7.4821, "learning_rate": 3.25e-05, "inf_nan_count": 0 }, { "step": 5225, "loss": 7.4765, "learning_rate": 3.27e-05, "inf_nan_count": 0 }, { "step": 5250, "loss": 7.468, "learning_rate": 3.28e-05, "inf_nan_count": 0 }, { "step": 5275, "loss": 7.5165, "learning_rate": 3.3e-05, "inf_nan_count": 0 }, { "step": 5300, "loss": 7.5334, "learning_rate": 3.31e-05, "inf_nan_count": 0 }, { "step": 5325, "loss": 7.5053, "learning_rate": 3.33e-05, "inf_nan_count": 0 }, { "step": 5350, "loss": 7.5115, "learning_rate": 3.34e-05, "inf_nan_count": 0 }, { "step": 5375, "loss": 7.4736, "learning_rate": 3.36e-05, "inf_nan_count": 0 }, { "step": 5400, "loss": 7.452, "learning_rate": 3.38e-05, "inf_nan_count": 0 }, { "step": 5425, "loss": 7.4596, "learning_rate": 3.39e-05, "inf_nan_count": 0 }, { "step": 5450, "loss": 7.4518, "learning_rate": 3.41e-05, "inf_nan_count": 0 }, { "step": 5475, "loss": 7.4308, "learning_rate": 3.42e-05, "inf_nan_count": 0 }, { "step": 5500, "loss": 7.4627, "learning_rate": 3.44e-05, "inf_nan_count": 0 }, { "step": 5525, "loss": 7.4095, "learning_rate": 3.45e-05, "inf_nan_count": 0 }, { "step": 5550, "loss": 7.4423, "learning_rate": 3.47e-05, "inf_nan_count": 0 }, { "step": 5575, "loss": 7.46, "learning_rate": 3.48e-05, "inf_nan_count": 0 }, { "step": 5600, "loss": 7.3457, "learning_rate": 3.5e-05, "inf_nan_count": 0 }, { "step": 5625, "loss": 7.4838, "learning_rate": 3.52e-05, "inf_nan_count": 0 }, { "step": 5650, "loss": 7.4556, "learning_rate": 3.53e-05, "inf_nan_count": 0 }, { "step": 5675, "loss": 7.422, "learning_rate": 3.55e-05, "inf_nan_count": 0 }, { "step": 5700, "loss": 7.4307, "learning_rate": 3.56e-05, "inf_nan_count": 0 }, { "step": 5725, "loss": 7.3795, "learning_rate": 3.58e-05, "inf_nan_count": 0 }, { "step": 5750, "loss": 7.3855, "learning_rate": 3.59e-05, "inf_nan_count": 0 }, { "step": 5775, "loss": 7.3518, "learning_rate": 3.61e-05, "inf_nan_count": 0 }, { "step": 5800, "loss": 7.3794, "learning_rate": 3.63e-05, "inf_nan_count": 0 }, { "step": 5825, "loss": 7.3591, "learning_rate": 3.64e-05, "inf_nan_count": 0 }, { "step": 5850, "loss": 7.3489, "learning_rate": 3.66e-05, "inf_nan_count": 0 }, { "step": 5875, "loss": 7.4108, "learning_rate": 3.67e-05, "inf_nan_count": 0 }, { "step": 5900, "loss": 7.358, "learning_rate": 3.69e-05, "inf_nan_count": 0 }, { "step": 5925, "loss": 7.3131, "learning_rate": 3.7e-05, "inf_nan_count": 0 }, { "step": 5950, "loss": 7.2905, "learning_rate": 3.72e-05, "inf_nan_count": 0 }, { "step": 5975, "loss": 7.3466, "learning_rate": 3.73e-05, "inf_nan_count": 0 }, { "step": 6000, "loss": 7.3765, "learning_rate": 3.75e-05, "inf_nan_count": 0 }, { "step": 6025, "loss": 7.287, "learning_rate": 3.77e-05, "inf_nan_count": 0 }, { "step": 6050, "loss": 7.3333, "learning_rate": 3.78e-05, "inf_nan_count": 0 }, { "step": 6075, "loss": 7.3098, "learning_rate": 3.8e-05, "inf_nan_count": 0 }, { "step": 6100, "loss": 7.2594, "learning_rate": 3.81e-05, "inf_nan_count": 0 }, { "step": 6125, "loss": 7.3327, "learning_rate": 3.83e-05, "inf_nan_count": 0 }, { "step": 6150, "loss": 7.303, "learning_rate": 3.84e-05, "inf_nan_count": 0 }, { "step": 6175, "loss": 7.2523, "learning_rate": 3.86e-05, "inf_nan_count": 0 }, { "step": 6200, "loss": 7.2546, "learning_rate": 3.87e-05, "inf_nan_count": 0 }, { "step": 6225, "loss": 7.3242, "learning_rate": 3.89e-05, "inf_nan_count": 0 }, { "step": 6250, "loss": 7.2035, "learning_rate": 3.91e-05, "inf_nan_count": 0 }, { "step": 6275, "loss": 7.2334, "learning_rate": 3.92e-05, "inf_nan_count": 0 }, { "step": 6300, "loss": 7.2295, "learning_rate": 3.94e-05, "inf_nan_count": 0 }, { "step": 6325, "loss": 7.3051, "learning_rate": 3.95e-05, "inf_nan_count": 0 }, { "step": 6350, "loss": 7.3188, "learning_rate": 3.97e-05, "inf_nan_count": 0 }, { "step": 6375, "loss": 7.3212, "learning_rate": 3.98e-05, "inf_nan_count": 0 }, { "step": 6400, "loss": 7.2465, "learning_rate": 4e-05, "inf_nan_count": 0 }, { "step": 6425, "loss": 7.2081, "learning_rate": 4.02e-05, "inf_nan_count": 0 }, { "step": 6450, "loss": 7.2852, "learning_rate": 4.03e-05, "inf_nan_count": 0 }, { "step": 6475, "loss": 7.2074, "learning_rate": 4.05e-05, "inf_nan_count": 0 }, { "step": 6500, "loss": 7.252, "learning_rate": 4.06e-05, "inf_nan_count": 0 }, { "step": 6525, "loss": 7.2115, "learning_rate": 4.08e-05, "inf_nan_count": 0 }, { "step": 6550, "loss": 7.2435, "learning_rate": 4.09e-05, "inf_nan_count": 0 }, { "step": 6575, "loss": 7.1962, "learning_rate": 4.11e-05, "inf_nan_count": 0 }, { "step": 6600, "loss": 7.1631, "learning_rate": 4.12e-05, "inf_nan_count": 0 }, { "step": 6625, "loss": 7.2525, "learning_rate": 4.14e-05, "inf_nan_count": 0 }, { "step": 6650, "loss": 7.2133, "learning_rate": 4.16e-05, "inf_nan_count": 0 }, { "step": 6675, "loss": 7.2248, "learning_rate": 4.17e-05, "inf_nan_count": 0 }, { "step": 6700, "loss": 7.1928, "learning_rate": 4.19e-05, "inf_nan_count": 0 }, { "step": 6725, "loss": 7.1698, "learning_rate": 4.2e-05, "inf_nan_count": 0 }, { "step": 6750, "loss": 7.3037, "learning_rate": 4.22e-05, "inf_nan_count": 0 }, { "step": 6775, "loss": 7.2451, "learning_rate": 4.23e-05, "inf_nan_count": 0 }, { "step": 6800, "loss": 7.1373, "learning_rate": 4.25e-05, "inf_nan_count": 0 }, { "step": 6825, "loss": 7.139, "learning_rate": 4.27e-05, "inf_nan_count": 0 }, { "step": 6850, "loss": 7.1296, "learning_rate": 4.28e-05, "inf_nan_count": 0 }, { "step": 6875, "loss": 7.0961, "learning_rate": 4.3e-05, "inf_nan_count": 0 }, { "step": 6900, "loss": 7.1408, "learning_rate": 4.31e-05, "inf_nan_count": 0 }, { "step": 6925, "loss": 7.1852, "learning_rate": 4.33e-05, "inf_nan_count": 0 }, { "step": 6950, "loss": 7.2067, "learning_rate": 4.34e-05, "inf_nan_count": 0 }, { "step": 6975, "loss": 7.0681, "learning_rate": 4.36e-05, "inf_nan_count": 0 }, { "step": 7000, "loss": 7.1813, "learning_rate": 4.37e-05, "inf_nan_count": 0 }, { "step": 7025, "loss": 7.1992, "learning_rate": 4.39e-05, "inf_nan_count": 0 }, { "step": 7050, "loss": 7.1409, "learning_rate": 4.41e-05, "inf_nan_count": 0 }, { "step": 7075, "loss": 7.1271, "learning_rate": 4.42e-05, "inf_nan_count": 0 }, { "step": 7100, "loss": 7.172, "learning_rate": 4.44e-05, "inf_nan_count": 0 }, { "step": 7125, "loss": 7.1515, "learning_rate": 4.45e-05, "inf_nan_count": 0 }, { "step": 7150, "loss": 7.0898, "learning_rate": 4.47e-05, "inf_nan_count": 0 }, { "step": 7175, "loss": 7.0996, "learning_rate": 4.48e-05, "inf_nan_count": 0 }, { "step": 7200, "loss": 7.061, "learning_rate": 4.5e-05, "inf_nan_count": 0 }, { "step": 7225, "loss": 7.1939, "learning_rate": 4.52e-05, "inf_nan_count": 0 }, { "step": 7250, "loss": 7.0355, "learning_rate": 4.53e-05, "inf_nan_count": 0 }, { "step": 7275, "loss": 7.0935, "learning_rate": 4.55e-05, "inf_nan_count": 0 }, { "step": 7300, "loss": 7.0689, "learning_rate": 4.56e-05, "inf_nan_count": 0 }, { "step": 7325, "loss": 7.0265, "learning_rate": 4.58e-05, "inf_nan_count": 0 }, { "step": 7350, "loss": 7.0963, "learning_rate": 4.59e-05, "inf_nan_count": 0 }, { "step": 7375, "loss": 7.1138, "learning_rate": 4.61e-05, "inf_nan_count": 0 }, { "step": 7400, "loss": 7.0414, "learning_rate": 4.63e-05, "inf_nan_count": 0 }, { "step": 7425, "loss": 7.0753, "learning_rate": 4.64e-05, "inf_nan_count": 0 }, { "step": 7450, "loss": 7.0603, "learning_rate": 4.66e-05, "inf_nan_count": 0 }, { "step": 7475, "loss": 7.0818, "learning_rate": 4.67e-05, "inf_nan_count": 0 }, { "step": 7500, "loss": 7.0788, "learning_rate": 4.69e-05, "inf_nan_count": 0 }, { "step": 7525, "loss": 6.9952, "learning_rate": 4.7e-05, "inf_nan_count": 0 }, { "step": 7550, "loss": 7.0114, "learning_rate": 4.72e-05, "inf_nan_count": 0 }, { "step": 7575, "loss": 7.0611, "learning_rate": 4.73e-05, "inf_nan_count": 0 }, { "step": 7600, "loss": 7.0057, "learning_rate": 4.75e-05, "inf_nan_count": 0 }, { "step": 7625, "loss": 7.0182, "learning_rate": 4.77e-05, "inf_nan_count": 0 }, { "step": 7650, "loss": 7.0271, "learning_rate": 4.78e-05, "inf_nan_count": 0 }, { "step": 7675, "loss": 7.0817, "learning_rate": 4.8e-05, "inf_nan_count": 0 }, { "step": 7700, "loss": 7.0859, "learning_rate": 4.81e-05, "inf_nan_count": 0 }, { "step": 7725, "loss": 6.9859, "learning_rate": 4.83e-05, "inf_nan_count": 0 }, { "step": 7750, "loss": 7.038, "learning_rate": 4.84e-05, "inf_nan_count": 0 }, { "step": 7775, "loss": 6.9784, "learning_rate": 4.86e-05, "inf_nan_count": 0 }, { "step": 7800, "loss": 7.0304, "learning_rate": 4.87e-05, "inf_nan_count": 0 }, { "step": 7825, "loss": 7.0, "learning_rate": 4.89e-05, "inf_nan_count": 0 }, { "step": 7850, "loss": 7.0159, "learning_rate": 4.91e-05, "inf_nan_count": 0 }, { "step": 7875, "loss": 6.9859, "learning_rate": 4.92e-05, "inf_nan_count": 0 }, { "step": 7900, "loss": 6.9348, "learning_rate": 4.94e-05, "inf_nan_count": 0 }, { "step": 7925, "loss": 6.9541, "learning_rate": 4.95e-05, "inf_nan_count": 0 }, { "step": 7950, "loss": 6.9342, "learning_rate": 4.97e-05, "inf_nan_count": 0 }, { "step": 7975, "loss": 7.0294, "learning_rate": 4.98e-05, "inf_nan_count": 0 }, { "step": 8000, "loss": 7.0412, "learning_rate": 5e-05, "inf_nan_count": 0 }, { "step": 8025, "loss": 6.9111, "learning_rate": 4.99e-05, "inf_nan_count": 0 }, { "step": 8050, "loss": 7.0142, "learning_rate": 4.98e-05, "inf_nan_count": 0 }, { "step": 8075, "loss": 6.9201, "learning_rate": 4.97e-05, "inf_nan_count": 0 }, { "step": 8100, "loss": 6.91, "learning_rate": 4.96e-05, "inf_nan_count": 0 }, { "step": 8125, "loss": 6.9728, "learning_rate": 4.95e-05, "inf_nan_count": 0 }, { "step": 8150, "loss": 6.9963, "learning_rate": 4.94e-05, "inf_nan_count": 0 }, { "step": 8175, "loss": 7.0077, "learning_rate": 4.93e-05, "inf_nan_count": 0 }, { "step": 8200, "loss": 6.8808, "learning_rate": 4.92e-05, "inf_nan_count": 0 }, { "step": 8225, "loss": 6.85, "learning_rate": 4.91e-05, "inf_nan_count": 0 }, { "step": 8250, "loss": 6.9328, "learning_rate": 4.9e-05, "inf_nan_count": 0 }, { "step": 8275, "loss": 6.8971, "learning_rate": 4.89e-05, "inf_nan_count": 0 }, { "step": 8300, "loss": 6.9635, "learning_rate": 4.87e-05, "inf_nan_count": 0 }, { "step": 8325, "loss": 6.8937, "learning_rate": 4.86e-05, "inf_nan_count": 0 }, { "step": 8350, "loss": 6.8578, "learning_rate": 4.85e-05, "inf_nan_count": 0 }, { "step": 8375, "loss": 6.9492, "learning_rate": 4.84e-05, "inf_nan_count": 0 }, { "step": 8400, "loss": 6.8896, "learning_rate": 4.83e-05, "inf_nan_count": 0 }, { "step": 8425, "loss": 6.9677, "learning_rate": 4.82e-05, "inf_nan_count": 0 }, { "step": 8450, "loss": 6.9071, "learning_rate": 4.81e-05, "inf_nan_count": 0 }, { "step": 8475, "loss": 6.8973, "learning_rate": 4.8e-05, "inf_nan_count": 0 }, { "step": 8500, "loss": 6.9139, "learning_rate": 4.79e-05, "inf_nan_count": 0 }, { "step": 8525, "loss": 6.8983, "learning_rate": 4.78e-05, "inf_nan_count": 0 }, { "step": 8550, "loss": 6.8446, "learning_rate": 4.77e-05, "inf_nan_count": 0 }, { "step": 8575, "loss": 6.8246, "learning_rate": 4.76e-05, "inf_nan_count": 0 }, { "step": 8600, "loss": 6.9637, "learning_rate": 4.75e-05, "inf_nan_count": 0 }, { "step": 8625, "loss": 6.8827, "learning_rate": 4.74e-05, "inf_nan_count": 0 }, { "step": 8650, "loss": 6.8234, "learning_rate": 4.73e-05, "inf_nan_count": 0 }, { "step": 8675, "loss": 6.827, "learning_rate": 4.72e-05, "inf_nan_count": 0 }, { "step": 8700, "loss": 6.9554, "learning_rate": 4.71e-05, "inf_nan_count": 0 }, { "step": 8725, "loss": 6.8406, "learning_rate": 4.7e-05, "inf_nan_count": 0 }, { "step": 8750, "loss": 6.8328, "learning_rate": 4.69e-05, "inf_nan_count": 0 }, { "step": 8775, "loss": 6.8362, "learning_rate": 4.68e-05, "inf_nan_count": 0 }, { "step": 8800, "loss": 6.8417, "learning_rate": 4.67e-05, "inf_nan_count": 0 }, { "step": 8825, "loss": 6.8248, "learning_rate": 4.66e-05, "inf_nan_count": 0 }, { "step": 8850, "loss": 6.7996, "learning_rate": 4.65e-05, "inf_nan_count": 0 }, { "step": 8875, "loss": 6.7804, "learning_rate": 4.64e-05, "inf_nan_count": 0 }, { "step": 8900, "loss": 6.8802, "learning_rate": 4.63e-05, "inf_nan_count": 0 }, { "step": 8925, "loss": 6.8586, "learning_rate": 4.61e-05, "inf_nan_count": 0 }, { "step": 8950, "loss": 6.8489, "learning_rate": 4.6e-05, "inf_nan_count": 0 }, { "step": 8975, "loss": 6.8592, "learning_rate": 4.59e-05, "inf_nan_count": 0 }, { "step": 9000, "loss": 6.8302, "learning_rate": 4.58e-05, "inf_nan_count": 0 }, { "step": 9025, "loss": 6.831, "learning_rate": 4.57e-05, "inf_nan_count": 0 }, { "step": 9050, "loss": 6.7991, "learning_rate": 4.56e-05, "inf_nan_count": 0 }, { "step": 9075, "loss": 6.8311, "learning_rate": 4.55e-05, "inf_nan_count": 0 }, { "step": 9100, "loss": 6.7647, "learning_rate": 4.54e-05, "inf_nan_count": 0 }, { "step": 9125, "loss": 6.8225, "learning_rate": 4.53e-05, "inf_nan_count": 0 }, { "step": 9150, "loss": 6.7571, "learning_rate": 4.52e-05, "inf_nan_count": 0 }, { "step": 9175, "loss": 6.806, "learning_rate": 4.51e-05, "inf_nan_count": 0 }, { "step": 9200, "loss": 6.8348, "learning_rate": 4.5e-05, "inf_nan_count": 0 }, { "step": 9225, "loss": 6.9131, "learning_rate": 4.49e-05, "inf_nan_count": 0 }, { "step": 9250, "loss": 6.7801, "learning_rate": 4.48e-05, "inf_nan_count": 0 }, { "step": 9275, "loss": 6.7776, "learning_rate": 4.47e-05, "inf_nan_count": 0 }, { "step": 9300, "loss": 6.716, "learning_rate": 4.46e-05, "inf_nan_count": 0 }, { "step": 9325, "loss": 6.8958, "learning_rate": 4.45e-05, "inf_nan_count": 0 }, { "step": 9350, "loss": 6.8734, "learning_rate": 4.44e-05, "inf_nan_count": 0 }, { "step": 9375, "loss": 6.7203, "learning_rate": 4.43e-05, "inf_nan_count": 0 }, { "step": 9400, "loss": 6.7133, "learning_rate": 4.42e-05, "inf_nan_count": 0 }, { "step": 9425, "loss": 6.8392, "learning_rate": 4.41e-05, "inf_nan_count": 0 }, { "step": 9450, "loss": 6.7945, "learning_rate": 4.4e-05, "inf_nan_count": 0 }, { "step": 9475, "loss": 6.7831, "learning_rate": 4.39e-05, "inf_nan_count": 0 }, { "step": 9500, "loss": 6.7336, "learning_rate": 4.37e-05, "inf_nan_count": 0 }, { "step": 9525, "loss": 6.7529, "learning_rate": 4.36e-05, "inf_nan_count": 0 }, { "step": 9550, "loss": 6.6838, "learning_rate": 4.35e-05, "inf_nan_count": 0 }, { "step": 9575, "loss": 6.7548, "learning_rate": 4.34e-05, "inf_nan_count": 0 }, { "step": 9600, "loss": 6.8837, "learning_rate": 4.33e-05, "inf_nan_count": 0 }, { "step": 9625, "loss": 6.8271, "learning_rate": 4.32e-05, "inf_nan_count": 0 }, { "step": 9650, "loss": 6.7446, "learning_rate": 4.31e-05, "inf_nan_count": 0 }, { "step": 9675, "loss": 6.6811, "learning_rate": 4.3e-05, "inf_nan_count": 0 }, { "step": 9700, "loss": 6.7641, "learning_rate": 4.29e-05, "inf_nan_count": 0 }, { "step": 9725, "loss": 6.6779, "learning_rate": 4.28e-05, "inf_nan_count": 0 }, { "step": 9750, "loss": 6.7428, "learning_rate": 4.27e-05, "inf_nan_count": 0 }, { "step": 9775, "loss": 6.7698, "learning_rate": 4.26e-05, "inf_nan_count": 0 }, { "step": 9800, "loss": 6.7282, "learning_rate": 4.25e-05, "inf_nan_count": 0 }, { "step": 9825, "loss": 6.7314, "learning_rate": 4.24e-05, "inf_nan_count": 0 }, { "step": 9850, "loss": 6.7281, "learning_rate": 4.23e-05, "inf_nan_count": 0 }, { "step": 9875, "loss": 6.8553, "learning_rate": 4.22e-05, "inf_nan_count": 0 }, { "step": 9900, "loss": 6.7912, "learning_rate": 4.21e-05, "inf_nan_count": 0 }, { "step": 9925, "loss": 6.7301, "learning_rate": 4.2e-05, "inf_nan_count": 0 }, { "step": 9950, "loss": 6.7467, "learning_rate": 4.19e-05, "inf_nan_count": 0 }, { "step": 9975, "loss": 6.6581, "learning_rate": 4.18e-05, "inf_nan_count": 0 }, { "step": 10000, "loss": 6.7114, "learning_rate": 4.17e-05, "inf_nan_count": 0 }, { "step": 10025, "loss": 6.7754, "learning_rate": 4.16e-05, "inf_nan_count": 0 }, { "step": 10050, "loss": 6.695, "learning_rate": 4.15e-05, "inf_nan_count": 0 }, { "step": 10075, "loss": 6.6791, "learning_rate": 4.14e-05, "inf_nan_count": 0 }, { "step": 10100, "loss": 6.6957, "learning_rate": 4.12e-05, "inf_nan_count": 0 }, { "step": 10125, "loss": 6.7073, "learning_rate": 4.11e-05, "inf_nan_count": 0 }, { "step": 10150, "loss": 6.774, "learning_rate": 4.1e-05, "inf_nan_count": 0 }, { "step": 10175, "loss": 6.8045, "learning_rate": 4.09e-05, "inf_nan_count": 0 }, { "step": 10200, "loss": 6.761, "learning_rate": 4.08e-05, "inf_nan_count": 0 }, { "step": 10225, "loss": 6.6995, "learning_rate": 4.07e-05, "inf_nan_count": 0 }, { "step": 10250, "loss": 6.6779, "learning_rate": 4.06e-05, "inf_nan_count": 0 }, { "step": 10275, "loss": 6.7462, "learning_rate": 4.05e-05, "inf_nan_count": 0 }, { "step": 10300, "loss": 6.7099, "learning_rate": 4.04e-05, "inf_nan_count": 0 }, { "step": 10325, "loss": 6.7013, "learning_rate": 4.03e-05, "inf_nan_count": 0 }, { "step": 10350, "loss": 6.7173, "learning_rate": 4.02e-05, "inf_nan_count": 0 }, { "step": 10375, "loss": 6.6967, "learning_rate": 4.01e-05, "inf_nan_count": 0 }, { "step": 10400, "loss": 6.7565, "learning_rate": 4e-05, "inf_nan_count": 0 }, { "step": 10425, "loss": 6.7468, "learning_rate": 3.99e-05, "inf_nan_count": 0 }, { "step": 10450, "loss": 6.7132, "learning_rate": 3.98e-05, "inf_nan_count": 0 }, { "step": 10475, "loss": 6.6358, "learning_rate": 3.97e-05, "inf_nan_count": 0 }, { "step": 10500, "loss": 6.6979, "learning_rate": 3.96e-05, "inf_nan_count": 0 }, { "step": 10525, "loss": 6.6512, "learning_rate": 3.95e-05, "inf_nan_count": 0 }, { "step": 10550, "loss": 6.6045, "learning_rate": 3.94e-05, "inf_nan_count": 0 }, { "step": 10575, "loss": 6.6217, "learning_rate": 3.93e-05, "inf_nan_count": 0 }, { "step": 10600, "loss": 6.7091, "learning_rate": 3.92e-05, "inf_nan_count": 0 }, { "step": 10625, "loss": 6.618, "learning_rate": 3.91e-05, "inf_nan_count": 0 }, { "step": 10650, "loss": 6.6743, "learning_rate": 3.9e-05, "inf_nan_count": 0 }, { "step": 10675, "loss": 6.6481, "learning_rate": 3.89e-05, "inf_nan_count": 0 }, { "step": 10700, "loss": 6.6888, "learning_rate": 3.87e-05, "inf_nan_count": 0 }, { "step": 10725, "loss": 6.5786, "learning_rate": 3.86e-05, "inf_nan_count": 0 }, { "step": 10750, "loss": 6.6917, "learning_rate": 3.85e-05, "inf_nan_count": 0 }, { "step": 10775, "loss": 6.6487, "learning_rate": 3.84e-05, "inf_nan_count": 0 }, { "step": 10800, "loss": 6.7293, "learning_rate": 3.83e-05, "inf_nan_count": 0 }, { "step": 10825, "loss": 6.6369, "learning_rate": 3.82e-05, "inf_nan_count": 0 }, { "step": 10850, "loss": 6.7118, "learning_rate": 3.81e-05, "inf_nan_count": 0 }, { "step": 10875, "loss": 6.7235, "learning_rate": 3.8e-05, "inf_nan_count": 0 }, { "step": 10900, "loss": 6.6963, "learning_rate": 3.79e-05, "inf_nan_count": 0 }, { "step": 10925, "loss": 6.6791, "learning_rate": 3.78e-05, "inf_nan_count": 0 }, { "step": 10950, "loss": 6.6773, "learning_rate": 3.77e-05, "inf_nan_count": 0 }, { "step": 10975, "loss": 6.6819, "learning_rate": 3.76e-05, "inf_nan_count": 0 }, { "step": 11000, "loss": 6.6167, "learning_rate": 3.75e-05, "inf_nan_count": 0 }, { "step": 11025, "loss": 6.6727, "learning_rate": 3.74e-05, "inf_nan_count": 0 }, { "step": 11050, "loss": 6.6317, "learning_rate": 3.73e-05, "inf_nan_count": 0 }, { "step": 11075, "loss": 6.6432, "learning_rate": 3.72e-05, "inf_nan_count": 0 }, { "step": 11100, "loss": 6.6468, "learning_rate": 3.71e-05, "inf_nan_count": 0 }, { "step": 11125, "loss": 6.646, "learning_rate": 3.7e-05, "inf_nan_count": 0 }, { "step": 11150, "loss": 6.6852, "learning_rate": 3.69e-05, "inf_nan_count": 0 }, { "step": 11175, "loss": 6.5716, "learning_rate": 3.68e-05, "inf_nan_count": 0 }, { "step": 11200, "loss": 6.6311, "learning_rate": 3.67e-05, "inf_nan_count": 0 }, { "step": 11225, "loss": 6.648, "learning_rate": 3.66e-05, "inf_nan_count": 0 }, { "step": 11250, "loss": 6.6204, "learning_rate": 3.65e-05, "inf_nan_count": 0 }, { "step": 11275, "loss": 6.6551, "learning_rate": 3.64e-05, "inf_nan_count": 0 }, { "step": 11300, "loss": 6.6013, "learning_rate": 3.63e-05, "inf_nan_count": 0 }, { "step": 11325, "loss": 6.6478, "learning_rate": 3.61e-05, "inf_nan_count": 0 }, { "step": 11350, "loss": 6.6938, "learning_rate": 3.6e-05, "inf_nan_count": 0 }, { "step": 11375, "loss": 6.6124, "learning_rate": 3.59e-05, "inf_nan_count": 0 }, { "step": 11400, "loss": 6.6781, "learning_rate": 3.58e-05, "inf_nan_count": 0 }, { "step": 11425, "loss": 6.6317, "learning_rate": 3.57e-05, "inf_nan_count": 0 }, { "step": 11450, "loss": 6.6195, "learning_rate": 3.56e-05, "inf_nan_count": 0 }, { "step": 11475, "loss": 6.5941, "learning_rate": 3.55e-05, "inf_nan_count": 0 }, { "step": 11500, "loss": 6.5808, "learning_rate": 3.54e-05, "inf_nan_count": 0 }, { "step": 11525, "loss": 6.6322, "learning_rate": 3.53e-05, "inf_nan_count": 0 }, { "step": 11550, "loss": 6.6172, "learning_rate": 3.52e-05, "inf_nan_count": 0 }, { "step": 11575, "loss": 6.649, "learning_rate": 3.51e-05, "inf_nan_count": 0 }, { "step": 11600, "loss": 6.605, "learning_rate": 3.5e-05, "inf_nan_count": 0 }, { "step": 11625, "loss": 6.6184, "learning_rate": 3.49e-05, "inf_nan_count": 0 }, { "step": 11650, "loss": 6.5597, "learning_rate": 3.48e-05, "inf_nan_count": 0 }, { "step": 11675, "loss": 6.6285, "learning_rate": 3.47e-05, "inf_nan_count": 0 }, { "step": 11700, "loss": 6.5209, "learning_rate": 3.46e-05, "inf_nan_count": 0 }, { "step": 11725, "loss": 6.5505, "learning_rate": 3.45e-05, "inf_nan_count": 0 }, { "step": 11750, "loss": 6.671, "learning_rate": 3.44e-05, "inf_nan_count": 0 }, { "step": 11775, "loss": 6.6403, "learning_rate": 3.43e-05, "inf_nan_count": 0 }, { "step": 11800, "loss": 6.5738, "learning_rate": 3.42e-05, "inf_nan_count": 0 }, { "step": 11825, "loss": 6.608, "learning_rate": 3.41e-05, "inf_nan_count": 0 }, { "step": 11850, "loss": 6.6406, "learning_rate": 3.4e-05, "inf_nan_count": 0 }, { "step": 11875, "loss": 6.6299, "learning_rate": 3.39e-05, "inf_nan_count": 0 }, { "step": 11900, "loss": 6.5781, "learning_rate": 3.38e-05, "inf_nan_count": 0 }, { "step": 11925, "loss": 6.5003, "learning_rate": 3.36e-05, "inf_nan_count": 0 }, { "step": 11950, "loss": 6.635, "learning_rate": 3.35e-05, "inf_nan_count": 0 }, { "step": 11975, "loss": 6.618, "learning_rate": 3.34e-05, "inf_nan_count": 0 }, { "step": 12000, "loss": 6.6603, "learning_rate": 3.33e-05, "inf_nan_count": 0 }, { "step": 12025, "loss": 6.5507, "learning_rate": 3.32e-05, "inf_nan_count": 0 }, { "step": 12050, "loss": 6.5878, "learning_rate": 3.31e-05, "inf_nan_count": 0 }, { "step": 12075, "loss": 6.5245, "learning_rate": 3.3e-05, "inf_nan_count": 0 }, { "step": 12100, "loss": 6.5629, "learning_rate": 3.29e-05, "inf_nan_count": 0 }, { "step": 12125, "loss": 6.6181, "learning_rate": 3.28e-05, "inf_nan_count": 0 }, { "step": 12150, "loss": 6.578, "learning_rate": 3.27e-05, "inf_nan_count": 0 }, { "step": 12175, "loss": 6.5753, "learning_rate": 3.26e-05, "inf_nan_count": 0 }, { "step": 12200, "loss": 6.6071, "learning_rate": 3.25e-05, "inf_nan_count": 0 }, { "step": 12225, "loss": 6.5885, "learning_rate": 3.24e-05, "inf_nan_count": 0 }, { "step": 12250, "loss": 6.5413, "learning_rate": 3.23e-05, "inf_nan_count": 0 }, { "step": 12275, "loss": 6.6635, "learning_rate": 3.22e-05, "inf_nan_count": 0 }, { "step": 12300, "loss": 6.6304, "learning_rate": 3.21e-05, "inf_nan_count": 0 }, { "step": 12325, "loss": 6.5078, "learning_rate": 3.2e-05, "inf_nan_count": 0 }, { "step": 12350, "loss": 6.5712, "learning_rate": 3.19e-05, "inf_nan_count": 0 }, { "step": 12375, "loss": 6.6284, "learning_rate": 3.18e-05, "inf_nan_count": 0 }, { "step": 12400, "loss": 6.5837, "learning_rate": 3.17e-05, "inf_nan_count": 0 }, { "step": 12425, "loss": 6.5354, "learning_rate": 3.16e-05, "inf_nan_count": 0 }, { "step": 12450, "loss": 6.6125, "learning_rate": 3.15e-05, "inf_nan_count": 0 }, { "step": 12475, "loss": 6.5477, "learning_rate": 3.14e-05, "inf_nan_count": 0 }, { "step": 12500, "loss": 6.5827, "learning_rate": 3.13e-05, "inf_nan_count": 0 }, { "step": 12525, "loss": 6.5874, "learning_rate": 3.11e-05, "inf_nan_count": 0 }, { "step": 12550, "loss": 6.5437, "learning_rate": 3.1e-05, "inf_nan_count": 0 }, { "step": 12575, "loss": 6.582, "learning_rate": 3.09e-05, "inf_nan_count": 0 }, { "step": 12600, "loss": 6.5286, "learning_rate": 3.08e-05, "inf_nan_count": 0 }, { "step": 12625, "loss": 6.5144, "learning_rate": 3.07e-05, "inf_nan_count": 0 }, { "step": 12650, "loss": 6.5327, "learning_rate": 3.06e-05, "inf_nan_count": 0 }, { "step": 12675, "loss": 6.6058, "learning_rate": 3.05e-05, "inf_nan_count": 0 }, { "step": 12700, "loss": 6.5626, "learning_rate": 3.04e-05, "inf_nan_count": 0 }, { "step": 12725, "loss": 6.4589, "learning_rate": 3.03e-05, "inf_nan_count": 0 }, { "step": 12750, "loss": 6.5629, "learning_rate": 3.02e-05, "inf_nan_count": 0 }, { "step": 12775, "loss": 6.4815, "learning_rate": 3.01e-05, "inf_nan_count": 0 }, { "step": 12800, "loss": 6.5651, "learning_rate": 3e-05, "inf_nan_count": 0 }, { "step": 12825, "loss": 6.6164, "learning_rate": 2.99e-05, "inf_nan_count": 0 }, { "step": 12850, "loss": 6.6102, "learning_rate": 2.98e-05, "inf_nan_count": 0 }, { "step": 12875, "loss": 6.4871, "learning_rate": 2.97e-05, "inf_nan_count": 0 }, { "step": 12900, "loss": 6.49, "learning_rate": 2.96e-05, "inf_nan_count": 0 }, { "step": 12925, "loss": 6.6028, "learning_rate": 2.95e-05, "inf_nan_count": 0 }, { "step": 12950, "loss": 6.5509, "learning_rate": 2.94e-05, "inf_nan_count": 0 }, { "step": 12975, "loss": 6.5454, "learning_rate": 2.93e-05, "inf_nan_count": 0 }, { "step": 13000, "loss": 6.5587, "learning_rate": 2.92e-05, "inf_nan_count": 0 }, { "step": 13025, "loss": 6.5862, "learning_rate": 2.91e-05, "inf_nan_count": 0 }, { "step": 13050, "loss": 6.5668, "learning_rate": 2.9e-05, "inf_nan_count": 0 }, { "step": 13075, "loss": 6.522, "learning_rate": 2.89e-05, "inf_nan_count": 0 }, { "step": 13100, "loss": 6.5044, "learning_rate": 2.87e-05, "inf_nan_count": 0 }, { "step": 13125, "loss": 6.6356, "learning_rate": 2.86e-05, "inf_nan_count": 0 }, { "step": 13150, "loss": 6.4772, "learning_rate": 2.85e-05, "inf_nan_count": 0 }, { "step": 13175, "loss": 6.5504, "learning_rate": 2.84e-05, "inf_nan_count": 0 }, { "step": 13200, "loss": 6.5415, "learning_rate": 2.83e-05, "inf_nan_count": 0 }, { "step": 13225, "loss": 6.4651, "learning_rate": 2.82e-05, "inf_nan_count": 0 }, { "step": 13250, "loss": 6.5536, "learning_rate": 2.81e-05, "inf_nan_count": 0 }, { "step": 13275, "loss": 6.4861, "learning_rate": 2.8e-05, "inf_nan_count": 0 }, { "step": 13300, "loss": 6.4688, "learning_rate": 2.79e-05, "inf_nan_count": 0 }, { "step": 13325, "loss": 6.5549, "learning_rate": 2.78e-05, "inf_nan_count": 0 }, { "step": 13350, "loss": 6.4589, "learning_rate": 2.77e-05, "inf_nan_count": 0 }, { "step": 13375, "loss": 6.4644, "learning_rate": 2.76e-05, "inf_nan_count": 0 }, { "step": 13400, "loss": 6.5937, "learning_rate": 2.75e-05, "inf_nan_count": 0 }, { "step": 13425, "loss": 6.5798, "learning_rate": 2.74e-05, "inf_nan_count": 0 }, { "step": 13450, "loss": 6.4615, "learning_rate": 2.73e-05, "inf_nan_count": 0 }, { "step": 13475, "loss": 6.5173, "learning_rate": 2.72e-05, "inf_nan_count": 0 }, { "step": 13500, "loss": 6.4795, "learning_rate": 2.71e-05, "inf_nan_count": 0 }, { "step": 13525, "loss": 6.4789, "learning_rate": 2.7e-05, "inf_nan_count": 0 }, { "step": 13550, "loss": 6.4835, "learning_rate": 2.69e-05, "inf_nan_count": 0 }, { "step": 13575, "loss": 6.5405, "learning_rate": 2.68e-05, "inf_nan_count": 0 }, { "step": 13600, "loss": 6.4616, "learning_rate": 2.67e-05, "inf_nan_count": 0 }, { "step": 13625, "loss": 6.4578, "learning_rate": 2.66e-05, "inf_nan_count": 0 }, { "step": 13650, "loss": 6.4083, "learning_rate": 2.65e-05, "inf_nan_count": 0 }, { "step": 13675, "loss": 6.561, "learning_rate": 2.64e-05, "inf_nan_count": 0 }, { "step": 13700, "loss": 6.5432, "learning_rate": 2.63e-05, "inf_nan_count": 0 }, { "step": 13725, "loss": 6.5119, "learning_rate": 2.61e-05, "inf_nan_count": 0 }, { "step": 13750, "loss": 6.454, "learning_rate": 2.6e-05, "inf_nan_count": 0 }, { "step": 13775, "loss": 6.44, "learning_rate": 2.59e-05, "inf_nan_count": 0 }, { "step": 13800, "loss": 6.4767, "learning_rate": 2.58e-05, "inf_nan_count": 0 }, { "step": 13825, "loss": 6.4765, "learning_rate": 2.57e-05, "inf_nan_count": 0 }, { "step": 13850, "loss": 6.5018, "learning_rate": 2.56e-05, "inf_nan_count": 0 }, { "step": 13875, "loss": 6.5011, "learning_rate": 2.55e-05, "inf_nan_count": 0 }, { "step": 13900, "loss": 6.4283, "learning_rate": 2.54e-05, "inf_nan_count": 0 }, { "step": 13925, "loss": 6.519, "learning_rate": 2.53e-05, "inf_nan_count": 0 }, { "step": 13950, "loss": 6.4388, "learning_rate": 2.52e-05, "inf_nan_count": 0 }, { "step": 13975, "loss": 6.455, "learning_rate": 2.51e-05, "inf_nan_count": 0 }, { "step": 14000, "loss": 6.3491, "learning_rate": 2.5e-05, "inf_nan_count": 0 }, { "step": 14025, "loss": 6.5285, "learning_rate": 2.49e-05, "inf_nan_count": 0 }, { "step": 14050, "loss": 6.5082, "learning_rate": 2.48e-05, "inf_nan_count": 0 }, { "step": 14075, "loss": 6.5451, "learning_rate": 2.47e-05, "inf_nan_count": 0 }, { "step": 14100, "loss": 6.4753, "learning_rate": 2.46e-05, "inf_nan_count": 0 }, { "step": 14125, "loss": 6.6011, "learning_rate": 2.45e-05, "inf_nan_count": 0 }, { "step": 14150, "loss": 6.4885, "learning_rate": 2.44e-05, "inf_nan_count": 0 }, { "step": 14175, "loss": 6.4635, "learning_rate": 2.43e-05, "inf_nan_count": 0 }, { "step": 14200, "loss": 6.5519, "learning_rate": 2.42e-05, "inf_nan_count": 0 }, { "step": 14225, "loss": 6.4356, "learning_rate": 2.41e-05, "inf_nan_count": 0 }, { "step": 14250, "loss": 6.4552, "learning_rate": 2.4e-05, "inf_nan_count": 0 }, { "step": 14275, "loss": 6.4613, "learning_rate": 2.39e-05, "inf_nan_count": 0 }, { "step": 14300, "loss": 6.4411, "learning_rate": 2.38e-05, "inf_nan_count": 0 }, { "step": 14325, "loss": 6.557, "learning_rate": 2.36e-05, "inf_nan_count": 0 }, { "step": 14350, "loss": 6.4476, "learning_rate": 2.35e-05, "inf_nan_count": 0 }, { "step": 14375, "loss": 6.5895, "learning_rate": 2.34e-05, "inf_nan_count": 0 }, { "step": 14400, "loss": 6.4836, "learning_rate": 2.33e-05, "inf_nan_count": 0 }, { "step": 14425, "loss": 6.4175, "learning_rate": 2.32e-05, "inf_nan_count": 0 }, { "step": 14450, "loss": 6.4971, "learning_rate": 2.31e-05, "inf_nan_count": 0 }, { "step": 14475, "loss": 6.4897, "learning_rate": 2.3e-05, "inf_nan_count": 0 }, { "step": 14500, "loss": 6.455, "learning_rate": 2.29e-05, "inf_nan_count": 0 }, { "step": 14525, "loss": 6.4688, "learning_rate": 2.28e-05, "inf_nan_count": 0 }, { "step": 14550, "loss": 6.5494, "learning_rate": 2.27e-05, "inf_nan_count": 0 }, { "step": 14575, "loss": 6.4501, "learning_rate": 2.26e-05, "inf_nan_count": 0 }, { "step": 14600, "loss": 6.5142, "learning_rate": 2.25e-05, "inf_nan_count": 0 }, { "step": 14625, "loss": 6.4891, "learning_rate": 2.24e-05, "inf_nan_count": 0 }, { "step": 14650, "loss": 6.4274, "learning_rate": 2.23e-05, "inf_nan_count": 0 }, { "step": 14675, "loss": 6.5277, "learning_rate": 2.22e-05, "inf_nan_count": 0 }, { "step": 14700, "loss": 6.4472, "learning_rate": 2.21e-05, "inf_nan_count": 0 }, { "step": 14725, "loss": 6.4328, "learning_rate": 2.2e-05, "inf_nan_count": 0 }, { "step": 14750, "loss": 6.4928, "learning_rate": 2.19e-05, "inf_nan_count": 0 }, { "step": 14775, "loss": 6.552, "learning_rate": 2.18e-05, "inf_nan_count": 0 }, { "step": 14800, "loss": 6.5474, "learning_rate": 2.17e-05, "inf_nan_count": 0 }, { "step": 14825, "loss": 6.4394, "learning_rate": 2.16e-05, "inf_nan_count": 0 }, { "step": 14850, "loss": 6.5234, "learning_rate": 2.15e-05, "inf_nan_count": 0 }, { "step": 14875, "loss": 6.4369, "learning_rate": 2.14e-05, "inf_nan_count": 0 }, { "step": 14900, "loss": 6.4694, "learning_rate": 2.13e-05, "inf_nan_count": 0 }, { "step": 14925, "loss": 6.5837, "learning_rate": 2.11e-05, "inf_nan_count": 0 }, { "step": 14950, "loss": 6.4841, "learning_rate": 2.1e-05, "inf_nan_count": 0 }, { "step": 14975, "loss": 6.4347, "learning_rate": 2.09e-05, "inf_nan_count": 0 }, { "step": 15000, "loss": 6.5816, "learning_rate": 2.08e-05, "inf_nan_count": 0 }, { "step": 15025, "loss": 6.5337, "learning_rate": 2.07e-05, "inf_nan_count": 0 }, { "step": 15050, "loss": 6.5131, "learning_rate": 2.06e-05, "inf_nan_count": 0 }, { "step": 15075, "loss": 6.4669, "learning_rate": 2.05e-05, "inf_nan_count": 0 }, { "step": 15100, "loss": 6.5141, "learning_rate": 2.04e-05, "inf_nan_count": 0 }, { "step": 15125, "loss": 6.438, "learning_rate": 2.03e-05, "inf_nan_count": 0 }, { "step": 15150, "loss": 6.4036, "learning_rate": 2.02e-05, "inf_nan_count": 0 }, { "step": 15175, "loss": 6.4517, "learning_rate": 2.01e-05, "inf_nan_count": 0 }, { "step": 15200, "loss": 6.477, "learning_rate": 2e-05, "inf_nan_count": 0 }, { "step": 15225, "loss": 6.4317, "learning_rate": 1.99e-05, "inf_nan_count": 0 }, { "step": 15250, "loss": 6.488, "learning_rate": 1.98e-05, "inf_nan_count": 0 }, { "step": 15275, "loss": 6.4466, "learning_rate": 1.97e-05, "inf_nan_count": 0 }, { "step": 15300, "loss": 6.4248, "learning_rate": 1.96e-05, "inf_nan_count": 0 }, { "step": 15325, "loss": 6.3834, "learning_rate": 1.95e-05, "inf_nan_count": 0 }, { "step": 15350, "loss": 6.4272, "learning_rate": 1.94e-05, "inf_nan_count": 0 }, { "step": 15375, "loss": 6.4834, "learning_rate": 1.93e-05, "inf_nan_count": 0 }, { "step": 15400, "loss": 6.405, "learning_rate": 1.92e-05, "inf_nan_count": 0 }, { "step": 15425, "loss": 6.4264, "learning_rate": 1.91e-05, "inf_nan_count": 0 }, { "step": 15450, "loss": 6.4941, "learning_rate": 1.9e-05, "inf_nan_count": 0 }, { "step": 15475, "loss": 6.4755, "learning_rate": 1.89e-05, "inf_nan_count": 0 }, { "step": 15500, "loss": 6.5459, "learning_rate": 1.88e-05, "inf_nan_count": 0 }, { "step": 15525, "loss": 6.3772, "learning_rate": 1.86e-05, "inf_nan_count": 0 }, { "step": 15550, "loss": 6.443, "learning_rate": 1.85e-05, "inf_nan_count": 0 }, { "step": 15575, "loss": 6.3931, "learning_rate": 1.84e-05, "inf_nan_count": 0 }, { "step": 15600, "loss": 6.4087, "learning_rate": 1.83e-05, "inf_nan_count": 0 }, { "step": 15625, "loss": 6.4743, "learning_rate": 1.82e-05, "inf_nan_count": 0 }, { "step": 15650, "loss": 6.4575, "learning_rate": 1.81e-05, "inf_nan_count": 0 }, { "step": 15675, "loss": 6.4971, "learning_rate": 1.8e-05, "inf_nan_count": 0 }, { "step": 15700, "loss": 6.438, "learning_rate": 1.79e-05, "inf_nan_count": 0 }, { "step": 15725, "loss": 6.5071, "learning_rate": 1.78e-05, "inf_nan_count": 0 }, { "step": 15750, "loss": 6.391, "learning_rate": 1.77e-05, "inf_nan_count": 0 }, { "step": 15775, "loss": 6.4386, "learning_rate": 1.76e-05, "inf_nan_count": 0 }, { "step": 15800, "loss": 6.4268, "learning_rate": 1.75e-05, "inf_nan_count": 0 }, { "step": 15825, "loss": 6.5534, "learning_rate": 1.74e-05, "inf_nan_count": 0 }, { "step": 15850, "loss": 6.4422, "learning_rate": 1.73e-05, "inf_nan_count": 0 }, { "step": 15875, "loss": 6.4075, "learning_rate": 1.72e-05, "inf_nan_count": 0 }, { "step": 15900, "loss": 6.4458, "learning_rate": 1.71e-05, "inf_nan_count": 0 }, { "step": 15925, "loss": 6.3855, "learning_rate": 1.7e-05, "inf_nan_count": 0 }, { "step": 15950, "loss": 6.3659, "learning_rate": 1.69e-05, "inf_nan_count": 0 }, { "step": 15975, "loss": 6.5396, "learning_rate": 1.68e-05, "inf_nan_count": 0 }, { "step": 16000, "loss": 6.4974, "learning_rate": 1.67e-05, "inf_nan_count": 0 }, { "step": 16025, "loss": 6.4785, "learning_rate": 1.66e-05, "inf_nan_count": 0 }, { "step": 16050, "loss": 6.4341, "learning_rate": 1.65e-05, "inf_nan_count": 0 }, { "step": 16075, "loss": 6.3709, "learning_rate": 1.64e-05, "inf_nan_count": 0 }, { "step": 16100, "loss": 6.3707, "learning_rate": 1.63e-05, "inf_nan_count": 0 }, { "step": 16125, "loss": 6.4206, "learning_rate": 1.61e-05, "inf_nan_count": 0 }, { "step": 16150, "loss": 6.397, "learning_rate": 1.6e-05, "inf_nan_count": 0 }, { "step": 16175, "loss": 6.4617, "learning_rate": 1.59e-05, "inf_nan_count": 0 }, { "step": 16200, "loss": 6.5586, "learning_rate": 1.58e-05, "inf_nan_count": 0 }, { "step": 16225, "loss": 6.4248, "learning_rate": 1.57e-05, "inf_nan_count": 0 }, { "step": 16250, "loss": 6.4204, "learning_rate": 1.56e-05, "inf_nan_count": 0 }, { "step": 16275, "loss": 6.4632, "learning_rate": 1.55e-05, "inf_nan_count": 0 }, { "step": 16300, "loss": 6.4491, "learning_rate": 1.54e-05, "inf_nan_count": 0 }, { "step": 16325, "loss": 6.4412, "learning_rate": 1.53e-05, "inf_nan_count": 0 }, { "step": 16350, "loss": 6.4144, "learning_rate": 1.52e-05, "inf_nan_count": 0 }, { "step": 16375, "loss": 6.466, "learning_rate": 1.51e-05, "inf_nan_count": 0 }, { "step": 16400, "loss": 6.4246, "learning_rate": 1.5e-05, "inf_nan_count": 0 }, { "step": 16425, "loss": 6.4571, "learning_rate": 1.49e-05, "inf_nan_count": 0 }, { "step": 16450, "loss": 6.3903, "learning_rate": 1.48e-05, "inf_nan_count": 0 }, { "step": 16475, "loss": 6.4141, "learning_rate": 1.47e-05, "inf_nan_count": 0 }, { "step": 16500, "loss": 6.4467, "learning_rate": 1.46e-05, "inf_nan_count": 0 }, { "step": 16525, "loss": 6.356, "learning_rate": 1.45e-05, "inf_nan_count": 0 }, { "step": 16550, "loss": 6.4049, "learning_rate": 1.44e-05, "inf_nan_count": 0 }, { "step": 16575, "loss": 6.4103, "learning_rate": 1.43e-05, "inf_nan_count": 0 }, { "step": 16600, "loss": 6.4282, "learning_rate": 1.42e-05, "inf_nan_count": 0 }, { "step": 16625, "loss": 6.5397, "learning_rate": 1.41e-05, "inf_nan_count": 0 }, { "step": 16650, "loss": 6.3862, "learning_rate": 1.4e-05, "inf_nan_count": 0 }, { "step": 16675, "loss": 6.4291, "learning_rate": 1.39e-05, "inf_nan_count": 0 }, { "step": 16700, "loss": 6.433, "learning_rate": 1.38e-05, "inf_nan_count": 0 }, { "step": 16725, "loss": 6.3934, "learning_rate": 1.36e-05, "inf_nan_count": 0 }, { "step": 16750, "loss": 6.4042, "learning_rate": 1.35e-05, "inf_nan_count": 0 }, { "step": 16775, "loss": 6.4187, "learning_rate": 1.34e-05, "inf_nan_count": 0 }, { "step": 16800, "loss": 6.4455, "learning_rate": 1.33e-05, "inf_nan_count": 0 }, { "step": 16825, "loss": 6.424, "learning_rate": 1.32e-05, "inf_nan_count": 0 }, { "step": 16850, "loss": 6.4491, "learning_rate": 1.31e-05, "inf_nan_count": 0 }, { "step": 16875, "loss": 6.3993, "learning_rate": 1.3e-05, "inf_nan_count": 0 }, { "step": 16900, "loss": 6.4393, "learning_rate": 1.29e-05, "inf_nan_count": 0 }, { "step": 16925, "loss": 6.3705, "learning_rate": 1.28e-05, "inf_nan_count": 0 }, { "step": 16950, "loss": 6.4404, "learning_rate": 1.27e-05, "inf_nan_count": 0 }, { "step": 16975, "loss": 6.4507, "learning_rate": 1.26e-05, "inf_nan_count": 0 }, { "step": 17000, "loss": 6.3821, "learning_rate": 1.25e-05, "inf_nan_count": 0 }, { "step": 17025, "loss": 6.4234, "learning_rate": 1.24e-05, "inf_nan_count": 0 }, { "step": 17050, "loss": 6.4235, "learning_rate": 1.23e-05, "inf_nan_count": 0 }, { "step": 17075, "loss": 6.4856, "learning_rate": 1.22e-05, "inf_nan_count": 0 }, { "step": 17100, "loss": 6.4877, "learning_rate": 1.21e-05, "inf_nan_count": 0 }, { "step": 17125, "loss": 6.3683, "learning_rate": 1.2e-05, "inf_nan_count": 0 }, { "step": 17150, "loss": 6.4225, "learning_rate": 1.19e-05, "inf_nan_count": 0 }, { "step": 17175, "loss": 6.2573, "learning_rate": 1.18e-05, "inf_nan_count": 0 }, { "step": 17200, "loss": 6.3946, "learning_rate": 1.17e-05, "inf_nan_count": 0 }, { "step": 17225, "loss": 6.4607, "learning_rate": 1.16e-05, "inf_nan_count": 0 }, { "step": 17250, "loss": 6.4407, "learning_rate": 1.15e-05, "inf_nan_count": 0 }, { "step": 17275, "loss": 6.4333, "learning_rate": 1.14e-05, "inf_nan_count": 0 }, { "step": 17300, "loss": 6.3782, "learning_rate": 1.13e-05, "inf_nan_count": 0 }, { "step": 17325, "loss": 6.3665, "learning_rate": 1.11e-05, "inf_nan_count": 0 }, { "step": 17350, "loss": 6.4329, "learning_rate": 1.1e-05, "inf_nan_count": 0 }, { "step": 17375, "loss": 6.5107, "learning_rate": 1.09e-05, "inf_nan_count": 0 }, { "step": 17400, "loss": 6.5076, "learning_rate": 1.08e-05, "inf_nan_count": 0 }, { "step": 17425, "loss": 6.4936, "learning_rate": 1.07e-05, "inf_nan_count": 0 }, { "step": 17450, "loss": 6.4119, "learning_rate": 1.06e-05, "inf_nan_count": 0 }, { "step": 17475, "loss": 6.4032, "learning_rate": 1.05e-05, "inf_nan_count": 0 }, { "step": 17500, "loss": 6.3962, "learning_rate": 1.04e-05, "inf_nan_count": 0 }, { "step": 17525, "loss": 6.4288, "learning_rate": 1.03e-05, "inf_nan_count": 0 }, { "step": 17550, "loss": 6.4021, "learning_rate": 1.02e-05, "inf_nan_count": 0 }, { "step": 17575, "loss": 6.367, "learning_rate": 1.01e-05, "inf_nan_count": 0 }, { "step": 17600, "loss": 6.3904, "learning_rate": 1e-05, "inf_nan_count": 0 }, { "step": 17625, "loss": 6.5059, "learning_rate": 9.9e-06, "inf_nan_count": 0 }, { "step": 17650, "loss": 6.4225, "learning_rate": 9.79e-06, "inf_nan_count": 0 }, { "step": 17675, "loss": 6.4422, "learning_rate": 9.69e-06, "inf_nan_count": 0 }, { "step": 17700, "loss": 6.457, "learning_rate": 9.58e-06, "inf_nan_count": 0 }, { "step": 17725, "loss": 6.4475, "learning_rate": 9.48e-06, "inf_nan_count": 0 }, { "step": 17750, "loss": 6.3786, "learning_rate": 9.38e-06, "inf_nan_count": 0 }, { "step": 17775, "loss": 6.4145, "learning_rate": 9.27e-06, "inf_nan_count": 0 }, { "step": 17800, "loss": 6.3543, "learning_rate": 9.17e-06, "inf_nan_count": 0 }, { "step": 17825, "loss": 6.5116, "learning_rate": 9.06e-06, "inf_nan_count": 0 }, { "step": 17850, "loss": 6.4101, "learning_rate": 8.96e-06, "inf_nan_count": 0 }, { "step": 17875, "loss": 6.4014, "learning_rate": 8.85e-06, "inf_nan_count": 0 }, { "step": 17900, "loss": 6.4216, "learning_rate": 8.75e-06, "inf_nan_count": 0 }, { "step": 17925, "loss": 6.4539, "learning_rate": 8.65e-06, "inf_nan_count": 0 }, { "step": 17950, "loss": 6.4205, "learning_rate": 8.54e-06, "inf_nan_count": 0 }, { "step": 17975, "loss": 6.3865, "learning_rate": 8.44e-06, "inf_nan_count": 0 }, { "step": 18000, "loss": 6.4347, "learning_rate": 8.33e-06, "inf_nan_count": 0 }, { "step": 18025, "loss": 6.4313, "learning_rate": 8.23e-06, "inf_nan_count": 0 }, { "step": 18050, "loss": 6.3868, "learning_rate": 8.13e-06, "inf_nan_count": 0 }, { "step": 18075, "loss": 6.3703, "learning_rate": 8.02e-06, "inf_nan_count": 0 }, { "step": 18100, "loss": 6.3747, "learning_rate": 7.92e-06, "inf_nan_count": 0 }, { "step": 18125, "loss": 6.4228, "learning_rate": 7.81e-06, "inf_nan_count": 0 }, { "step": 18150, "loss": 6.349, "learning_rate": 7.71e-06, "inf_nan_count": 0 }, { "step": 18175, "loss": 6.4522, "learning_rate": 7.6e-06, "inf_nan_count": 0 }, { "step": 18200, "loss": 6.3354, "learning_rate": 7.5e-06, "inf_nan_count": 0 }, { "step": 18225, "loss": 6.4663, "learning_rate": 7.4e-06, "inf_nan_count": 0 }, { "step": 18250, "loss": 6.4155, "learning_rate": 7.29e-06, "inf_nan_count": 0 }, { "step": 18275, "loss": 6.4584, "learning_rate": 7.19e-06, "inf_nan_count": 0 }, { "step": 18300, "loss": 6.3637, "learning_rate": 7.08e-06, "inf_nan_count": 0 }, { "step": 18325, "loss": 6.3583, "learning_rate": 6.98e-06, "inf_nan_count": 0 }, { "step": 18350, "loss": 6.4469, "learning_rate": 6.88e-06, "inf_nan_count": 0 }, { "step": 18375, "loss": 6.3768, "learning_rate": 6.77e-06, "inf_nan_count": 0 }, { "step": 18400, "loss": 6.3179, "learning_rate": 6.67e-06, "inf_nan_count": 0 }, { "step": 18425, "loss": 6.4046, "learning_rate": 6.56e-06, "inf_nan_count": 0 }, { "step": 18450, "loss": 6.3435, "learning_rate": 6.46e-06, "inf_nan_count": 0 }, { "step": 18475, "loss": 6.3454, "learning_rate": 6.35e-06, "inf_nan_count": 0 }, { "step": 18500, "loss": 6.3922, "learning_rate": 6.25e-06, "inf_nan_count": 0 }, { "step": 18525, "loss": 6.3459, "learning_rate": 6.15e-06, "inf_nan_count": 0 }, { "step": 18550, "loss": 6.3591, "learning_rate": 6.04e-06, "inf_nan_count": 0 }, { "step": 18575, "loss": 6.4337, "learning_rate": 5.94e-06, "inf_nan_count": 0 }, { "step": 18600, "loss": 6.3962, "learning_rate": 5.83e-06, "inf_nan_count": 0 }, { "step": 18625, "loss": 6.3425, "learning_rate": 5.73e-06, "inf_nan_count": 0 }, { "step": 18650, "loss": 6.4022, "learning_rate": 5.63e-06, "inf_nan_count": 0 }, { "step": 18675, "loss": 6.4513, "learning_rate": 5.52e-06, "inf_nan_count": 0 }, { "step": 18700, "loss": 6.4284, "learning_rate": 5.42e-06, "inf_nan_count": 0 }, { "step": 18725, "loss": 6.3879, "learning_rate": 5.31e-06, "inf_nan_count": 0 }, { "step": 18750, "loss": 6.4009, "learning_rate": 5.21e-06, "inf_nan_count": 0 }, { "step": 18775, "loss": 6.3713, "learning_rate": 5.1e-06, "inf_nan_count": 0 }, { "step": 18800, "loss": 6.3752, "learning_rate": 5e-06, "inf_nan_count": 0 }, { "step": 18825, "loss": 6.4265, "learning_rate": 4.9e-06, "inf_nan_count": 0 }, { "step": 18850, "loss": 6.3709, "learning_rate": 4.79e-06, "inf_nan_count": 0 }, { "step": 18875, "loss": 6.3316, "learning_rate": 4.69e-06, "inf_nan_count": 0 }, { "step": 18900, "loss": 6.4479, "learning_rate": 4.58e-06, "inf_nan_count": 0 }, { "step": 18925, "loss": 6.4247, "learning_rate": 4.48e-06, "inf_nan_count": 0 }, { "step": 18950, "loss": 6.4126, "learning_rate": 4.37e-06, "inf_nan_count": 0 }, { "step": 18975, "loss": 6.3489, "learning_rate": 4.27e-06, "inf_nan_count": 0 }, { "step": 19000, "loss": 6.325, "learning_rate": 4.17e-06, "inf_nan_count": 0 }, { "step": 19025, "loss": 6.3306, "learning_rate": 4.06e-06, "inf_nan_count": 0 }, { "step": 19050, "loss": 6.387, "learning_rate": 3.96e-06, "inf_nan_count": 0 }, { "step": 19075, "loss": 6.4133, "learning_rate": 3.85e-06, "inf_nan_count": 0 }, { "step": 19100, "loss": 6.334, "learning_rate": 3.75e-06, "inf_nan_count": 0 }, { "step": 19125, "loss": 6.3034, "learning_rate": 3.65e-06, "inf_nan_count": 0 }, { "step": 19150, "loss": 6.4097, "learning_rate": 3.54e-06, "inf_nan_count": 0 }, { "step": 19175, "loss": 6.442, "learning_rate": 3.44e-06, "inf_nan_count": 0 }, { "step": 19200, "loss": 6.3756, "learning_rate": 3.33e-06, "inf_nan_count": 0 }, { "step": 19225, "loss": 6.4037, "learning_rate": 3.23e-06, "inf_nan_count": 0 }, { "step": 19250, "loss": 6.3974, "learning_rate": 3.13e-06, "inf_nan_count": 0 }, { "step": 19275, "loss": 6.3933, "learning_rate": 3.02e-06, "inf_nan_count": 0 }, { "step": 19300, "loss": 6.3269, "learning_rate": 2.92e-06, "inf_nan_count": 0 }, { "step": 19325, "loss": 6.3907, "learning_rate": 2.81e-06, "inf_nan_count": 0 }, { "step": 19350, "loss": 6.3955, "learning_rate": 2.71e-06, "inf_nan_count": 0 }, { "step": 19375, "loss": 6.3972, "learning_rate": 2.6e-06, "inf_nan_count": 0 }, { "step": 19400, "loss": 6.3896, "learning_rate": 2.5e-06, "inf_nan_count": 0 }, { "step": 19425, "loss": 6.3425, "learning_rate": 2.4e-06, "inf_nan_count": 0 }, { "step": 19450, "loss": 6.3587, "learning_rate": 2.29e-06, "inf_nan_count": 0 }, { "step": 19475, "loss": 6.4179, "learning_rate": 2.19e-06, "inf_nan_count": 0 }, { "step": 19500, "loss": 6.4192, "learning_rate": 2.08e-06, "inf_nan_count": 0 }, { "step": 19525, "loss": 6.4252, "learning_rate": 1.98e-06, "inf_nan_count": 0 }, { "step": 19550, "loss": 6.3349, "learning_rate": 1.88e-06, "inf_nan_count": 0 }, { "step": 19575, "loss": 6.4042, "learning_rate": 1.77e-06, "inf_nan_count": 0 }, { "step": 19600, "loss": 6.3567, "learning_rate": 1.67e-06, "inf_nan_count": 0 }, { "step": 19625, "loss": 6.3912, "learning_rate": 1.56e-06, "inf_nan_count": 0 }, { "step": 19650, "loss": 6.3113, "learning_rate": 1.46e-06, "inf_nan_count": 0 }, { "step": 19675, "loss": 6.3756, "learning_rate": 1.35e-06, "inf_nan_count": 0 }, { "step": 19700, "loss": 6.385, "learning_rate": 1.25e-06, "inf_nan_count": 0 }, { "step": 19725, "loss": 6.3631, "learning_rate": 1.15e-06, "inf_nan_count": 0 }, { "step": 19750, "loss": 6.4564, "learning_rate": 1.04e-06, "inf_nan_count": 0 }, { "step": 19775, "loss": 6.3258, "learning_rate": 9.38e-07, "inf_nan_count": 0 }, { "step": 19800, "loss": 6.4682, "learning_rate": 8.33e-07, "inf_nan_count": 0 }, { "step": 19825, "loss": 6.4421, "learning_rate": 7.29e-07, "inf_nan_count": 0 }, { "step": 19850, "loss": 6.4342, "learning_rate": 6.25e-07, "inf_nan_count": 0 }, { "step": 19875, "loss": 6.4182, "learning_rate": 5.21e-07, "inf_nan_count": 0 }, { "step": 19900, "loss": 6.3203, "learning_rate": 4.17e-07, "inf_nan_count": 0 }, { "step": 19925, "loss": 6.4339, "learning_rate": 3.13e-07, "inf_nan_count": 0 }, { "step": 19950, "loss": 6.4095, "learning_rate": 2.08e-07, "inf_nan_count": 0 }, { "step": 19975, "loss": 6.4814, "learning_rate": 1.04e-07, "inf_nan_count": 0 } ], "evaluation_results": [ { "step": 1000, "paloma": 7.125172406420199e+27 }, { "step": 1500, "paloma": 6.5469212698356e+18 }, { "step": 2000, "paloma": 5.118641309912889e+18 }, { "step": 2500, "paloma": 3.37924315167126e+18 }, { "step": 3000, "paloma": 6.892747900243237e+18 }, { "step": 3500, "paloma": 2.0436832271954907e+19 }, { "step": 4000, "paloma": 4.1410268232311005e+19 }, { "step": 4500, "paloma": 3.4524340411684053e+19 }, { "step": 5000, "paloma": 2.320698426399461e+19 }, { "step": 5500, "paloma": 3.1834097890526753e+19 }, { "step": 6000, "paloma": 4.457139025979801e+19 }, { "step": 6500, "paloma": 7.3062353841856406e+19 }, { "step": 7000, "paloma": 1.2357969480287024e+20 }, { "step": 7500, "paloma": 2.7199371732053928e+20 }, { "step": 8000, "paloma": 7.181862506006892e+20 }, { "step": 8500, "paloma": 1.5123285241831744e+21 }, { "step": 9000, "paloma": 3.573074534351724e+21 }, { "step": 9500, "paloma": 7.403721262078652e+21 }, { "step": 10000, "paloma": 1.0650515380055143e+22 }, { "step": 10500, "paloma": 2.1077589258137904e+22 }, { "step": 11000, "paloma": 2.712416409262884e+22 }, { "step": 11500, "paloma": 4.877238989481918e+22 }, { "step": 12000, "paloma": 7.219509956260661e+22 }, { "step": 12500, "paloma": 1.1729325953411656e+23 }, { "step": 13000, "paloma": 1.729306754923583e+23 }, { "step": 13500, "paloma": 2.4018454768029128e+23 }, { "step": 14000, "paloma": 3.247328955167052e+23 }, { "step": 14500, "paloma": 4.43239578722337e+23 }, { "step": 15000, "paloma": 5.215164570276226e+23 }, { "step": 15500, "paloma": 6.102665947946271e+23 }, { "step": 16000, "paloma": 8.874629945146669e+23 }, { "step": 16500, "paloma": 9.981607121011733e+23 }, { "step": 17000, "paloma": 1.1075349421086151e+24 }, { "step": 17500, "paloma": 1.1064948792133394e+24 }, { "step": 18000, "paloma": 1.340918782615931e+24 }, { "step": 18500, "paloma": 1.4325241176004668e+24 }, { "step": 19000, "paloma": 1.5360601246943468e+24 }, { "step": 19500, "paloma": 1.6346615942991742e+24 }, { "step": 20000, "paloma": 1.645368302099182e+24 } ], "config": { "d_model": 96, "n_layers": 12, "max_seq_len": 2048, "vocab_size": 50304, "lr": 5e-05, "max_steps": 20000, "batch_size": 1 } }, { "run_name": "pico-decoder-tiny-dolma29k-v1", "log_file": "log_20250828_225300.log", "training_metrics": [ { "step": 1000, "loss": 7.7657, "learning_rate": 0.00012, "inf_nan_count": 0 }, { "step": 1100, "loss": 7.6733, "learning_rate": 0.000132, "inf_nan_count": 0 }, { "step": 1200, "loss": 7.5969, "learning_rate": 0.000144, "inf_nan_count": 0 }, { "step": 1300, "loss": 7.4765, "learning_rate": 0.000156, "inf_nan_count": 0 }, { "step": 1400, "loss": 7.3686, "learning_rate": 0.000168, "inf_nan_count": 0 }, { "step": 1500, "loss": 7.3251, "learning_rate": 0.00018, "inf_nan_count": 0 }, { "step": 1600, "loss": 7.184, "learning_rate": 0.000192, "inf_nan_count": 0 }, { "step": 1700, "loss": 7.1116, "learning_rate": 0.000204, "inf_nan_count": 0 }, { "step": 1800, "loss": 7.0565, "learning_rate": 0.000216, "inf_nan_count": 0 }, { "step": 1900, "loss": 6.9964, "learning_rate": 0.000228, "inf_nan_count": 0 }, { "step": 2000, "loss": 6.969, "learning_rate": 0.00024, "inf_nan_count": 0 }, { "step": 2100, "loss": 6.884, "learning_rate": 0.000252, "inf_nan_count": 0 }, { "step": 2200, "loss": 6.8334, "learning_rate": 0.000264, "inf_nan_count": 0 }, { "step": 2300, "loss": 6.815, "learning_rate": 0.000276, "inf_nan_count": 0 }, { "step": 2400, "loss": 6.7519, "learning_rate": 0.000288, "inf_nan_count": 0 }, { "step": 2500, "loss": 6.6908, "learning_rate": 0.0003, "inf_nan_count": 0 }, { "step": 2600, "loss": 6.6351, "learning_rate": 0.0003, "inf_nan_count": 0 }, { "step": 2700, "loss": 6.5568, "learning_rate": 0.0003, "inf_nan_count": 0 }, { "step": 2800, "loss": 6.5799, "learning_rate": 0.0003, "inf_nan_count": 0 }, { "step": 2900, "loss": 6.5467, "learning_rate": 0.000299, "inf_nan_count": 0 }, { "step": 3000, "loss": 6.4865, "learning_rate": 0.000299, "inf_nan_count": 0 }, { "step": 3100, "loss": 6.4604, "learning_rate": 0.000299, "inf_nan_count": 0 }, { "step": 3200, "loss": 6.4205, "learning_rate": 0.000299, "inf_nan_count": 0 }, { "step": 3300, "loss": 6.4127, "learning_rate": 0.000299, "inf_nan_count": 0 }, { "step": 3400, "loss": 6.3692, "learning_rate": 0.000299, "inf_nan_count": 0 }, { "step": 3500, "loss": 6.3761, "learning_rate": 0.000298, "inf_nan_count": 0 }, { "step": 3600, "loss": 6.2796, "learning_rate": 0.000298, "inf_nan_count": 0 }, { "step": 3700, "loss": 6.2988, "learning_rate": 0.000298, "inf_nan_count": 0 }, { "step": 3800, "loss": 6.2673, "learning_rate": 0.000298, "inf_nan_count": 0 }, { "step": 3900, "loss": 6.2715, "learning_rate": 0.000298, "inf_nan_count": 0 }, { "step": 4000, "loss": 6.189, "learning_rate": 0.000298, "inf_nan_count": 0 }, { "step": 4100, "loss": 6.1832, "learning_rate": 0.000298, "inf_nan_count": 0 }, { "step": 4200, "loss": 6.1553, "learning_rate": 0.000297, "inf_nan_count": 0 }, { "step": 4300, "loss": 6.1629, "learning_rate": 0.000297, "inf_nan_count": 0 }, { "step": 4400, "loss": 6.1061, "learning_rate": 0.000297, "inf_nan_count": 0 }, { "step": 4500, "loss": 6.1601, "learning_rate": 0.000297, "inf_nan_count": 0 }, { "step": 4600, "loss": 6.0963, "learning_rate": 0.000297, "inf_nan_count": 0 }, { "step": 4700, "loss": 6.078, "learning_rate": 0.000297, "inf_nan_count": 0 }, { "step": 4800, "loss": 6.0835, "learning_rate": 0.000297, "inf_nan_count": 0 }, { "step": 4900, "loss": 6.0519, "learning_rate": 0.000296, "inf_nan_count": 0 }, { "step": 5000, "loss": 6.0661, "learning_rate": 0.000296, "inf_nan_count": 0 }, { "step": 5100, "loss": 6.0121, "learning_rate": 0.000296, "inf_nan_count": 0 }, { "step": 5200, "loss": 6.0544, "learning_rate": 0.000296, "inf_nan_count": 0 }, { "step": 5300, "loss": 6.0224, "learning_rate": 0.000296, "inf_nan_count": 0 }, { "step": 5400, "loss": 5.9831, "learning_rate": 0.000296, "inf_nan_count": 0 }, { "step": 5500, "loss": 5.9553, "learning_rate": 0.000295, "inf_nan_count": 0 }, { "step": 5600, "loss": 5.9493, "learning_rate": 0.000295, "inf_nan_count": 0 }, { "step": 5700, "loss": 5.9943, "learning_rate": 0.000295, "inf_nan_count": 0 }, { "step": 5800, "loss": 5.963, "learning_rate": 0.000295, "inf_nan_count": 0 }, { "step": 5900, "loss": 5.9349, "learning_rate": 0.000295, "inf_nan_count": 0 }, { "step": 6000, "loss": 5.9087, "learning_rate": 0.000295, "inf_nan_count": 0 }, { "step": 6100, "loss": 5.8818, "learning_rate": 0.000295, "inf_nan_count": 0 }, { "step": 6200, "loss": 5.8535, "learning_rate": 0.000294, "inf_nan_count": 0 }, { "step": 6300, "loss": 5.8896, "learning_rate": 0.000294, "inf_nan_count": 0 }, { "step": 6400, "loss": 5.9007, "learning_rate": 0.000294, "inf_nan_count": 0 }, { "step": 6500, "loss": 5.8617, "learning_rate": 0.000294, "inf_nan_count": 0 }, { "step": 6600, "loss": 5.8201, "learning_rate": 0.000294, "inf_nan_count": 0 }, { "step": 6700, "loss": 5.8544, "learning_rate": 0.000294, "inf_nan_count": 0 }, { "step": 6800, "loss": 5.8532, "learning_rate": 0.000293, "inf_nan_count": 0 }, { "step": 6900, "loss": 5.795, "learning_rate": 0.000293, "inf_nan_count": 0 }, { "step": 7000, "loss": 5.8146, "learning_rate": 0.000293, "inf_nan_count": 0 }, { "step": 7100, "loss": 5.793, "learning_rate": 0.000293, "inf_nan_count": 0 }, { "step": 7200, "loss": 5.7827, "learning_rate": 0.000293, "inf_nan_count": 0 }, { "step": 7300, "loss": 5.7816, "learning_rate": 0.000293, "inf_nan_count": 0 }, { "step": 7400, "loss": 5.73, "learning_rate": 0.000293, "inf_nan_count": 0 }, { "step": 7500, "loss": 5.767, "learning_rate": 0.000292, "inf_nan_count": 0 }, { "step": 7600, "loss": 5.745, "learning_rate": 0.000292, "inf_nan_count": 0 }, { "step": 7700, "loss": 5.7499, "learning_rate": 0.000292, "inf_nan_count": 0 }, { "step": 7800, "loss": 5.7233, "learning_rate": 0.000292, "inf_nan_count": 0 }, { "step": 7900, "loss": 5.7219, "learning_rate": 0.000292, "inf_nan_count": 0 }, { "step": 8000, "loss": 5.7523, "learning_rate": 0.000292, "inf_nan_count": 0 }, { "step": 8100, "loss": 5.7145, "learning_rate": 0.000291, "inf_nan_count": 0 }, { "step": 8200, "loss": 5.7469, "learning_rate": 0.000291, "inf_nan_count": 0 }, { "step": 8300, "loss": 5.7363, "learning_rate": 0.000291, "inf_nan_count": 0 }, { "step": 8400, "loss": 5.6938, "learning_rate": 0.000291, "inf_nan_count": 0 }, { "step": 8500, "loss": 5.6994, "learning_rate": 0.000291, "inf_nan_count": 0 }, { "step": 8600, "loss": 5.6583, "learning_rate": 0.000291, "inf_nan_count": 0 }, { "step": 8700, "loss": 5.6885, "learning_rate": 0.000291, "inf_nan_count": 0 }, { "step": 8800, "loss": 5.6313, "learning_rate": 0.00029, "inf_nan_count": 0 }, { "step": 8900, "loss": 5.6314, "learning_rate": 0.00029, "inf_nan_count": 0 }, { "step": 9000, "loss": 5.6501, "learning_rate": 0.00029, "inf_nan_count": 0 }, { "step": 9100, "loss": 5.6357, "learning_rate": 0.00029, "inf_nan_count": 0 }, { "step": 9200, "loss": 5.6045, "learning_rate": 0.00029, "inf_nan_count": 0 }, { "step": 9300, "loss": 5.6405, "learning_rate": 0.00029, "inf_nan_count": 0 }, { "step": 9400, "loss": 5.6241, "learning_rate": 0.00029, "inf_nan_count": 0 }, { "step": 9500, "loss": 5.6247, "learning_rate": 0.000289, "inf_nan_count": 0 }, { "step": 9600, "loss": 5.5983, "learning_rate": 0.000289, "inf_nan_count": 0 }, { "step": 9700, "loss": 5.5978, "learning_rate": 0.000289, "inf_nan_count": 0 }, { "step": 9800, "loss": 5.5746, "learning_rate": 0.000289, "inf_nan_count": 0 } ], "evaluation_results": [ { "step": 1000, "paloma": 2.5468931158531133e+19 }, { "step": 2000, "paloma": 3.627192449295412e+21 }, { "step": 3000, "paloma": 9.90975658825673e+22 }, { "step": 4000, "paloma": 2.6252526658823776e+24 }, { "step": 5000, "paloma": 7.294956881845611e+25 }, { "step": 6000, "paloma": 1.6856570425562805e+27 }, { "step": 7000, "paloma": 9.22180682233585e+28 }, { "step": 8000, "paloma": 3.1300823362207656e+29 }, { "step": 9000, "paloma": 4.983924509492406e+30 } ], "config": { "d_model": 96, "n_layers": 12, "max_seq_len": 2048, "vocab_size": 50304, "lr": 0.0003, "max_steps": 200000, "batch_size": 1 } }, { "run_name": "pico-decoder-tiny-dolma-teensy-v0", "log_file": "log_20250828_210922.log", "training_metrics": [ { "step": 0, "loss": 10.9914, "learning_rate": 0.0, "inf_nan_count": 0 } ], "evaluation_results": [ { "step": 0, "paloma": 59434.76600609756 }, { "step": 27, "paloma": 59120.39268292683 } ], "config": { "d_model": 96, "n_layers": 12, "max_seq_len": 2048, "vocab_size": 50304, "lr": 0.0003, "max_steps": 200000, "batch_size": 8 } }, { "run_name": "pico-decoder-tiny-dolma-teensy-v1", "log_file": "log_20250828_220514.log", "training_metrics": [ { "step": 0, "loss": 10.9886, "learning_rate": 0.0, "inf_nan_count": 0 }, { "step": 100, "loss": 10.9373, "learning_rate": 1.2e-05, "inf_nan_count": 0 }, { "step": 200, "loss": 10.5423, "learning_rate": 2.4e-05, "inf_nan_count": 0 }, { "step": 300, "loss": 9.9452, "learning_rate": 3.6e-05, "inf_nan_count": 0 }, { "step": 400, "loss": 9.449, "learning_rate": 4.8e-05, "inf_nan_count": 0 }, { "step": 500, "loss": 8.8455, "learning_rate": 6e-05, "inf_nan_count": 0 }, { "step": 600, "loss": 8.1482, "learning_rate": 7.2e-05, "inf_nan_count": 0 }, { "step": 700, "loss": 7.4303, "learning_rate": 8.4e-05, "inf_nan_count": 0 }, { "step": 800, "loss": 7.0363, "learning_rate": 9.6e-05, "inf_nan_count": 0 }, { "step": 900, "loss": 6.9702, "learning_rate": 0.000108, "inf_nan_count": 0 }, { "step": 1000, "loss": 6.8975, "learning_rate": 0.00012, "inf_nan_count": 0 }, { "step": 1100, "loss": 6.892, "learning_rate": 0.000132, "inf_nan_count": 0 }, { "step": 1200, "loss": 6.6684, "learning_rate": 0.000144, "inf_nan_count": 0 }, { "step": 1300, "loss": 6.4754, "learning_rate": 0.000156, "inf_nan_count": 0 }, { "step": 1400, "loss": 6.3649, "learning_rate": 0.000168, "inf_nan_count": 0 }, { "step": 1500, "loss": 6.2981, "learning_rate": 0.00018, "inf_nan_count": 0 }, { "step": 1600, "loss": 6.1551, "learning_rate": 0.000192, "inf_nan_count": 0 }, { "step": 1700, "loss": 5.9163, "learning_rate": 0.000204, "inf_nan_count": 0 } ], "evaluation_results": [ { "step": 1000, "paloma": 9.54583880403771e+19 }, { "step": 1755, "paloma": 2.945795672816324e+21 } ], "config": { "d_model": 96, "n_layers": 12, "max_seq_len": 2048, "vocab_size": 50304, "lr": 0.0003, "max_steps": 200000, "batch_size": 4 } }, { "run_name": "pico-decoder-tiny-dolma5M-v1", "log_file": "log_20250830_014108.log", "training_metrics": [ { "step": 32000, "loss": 6.3376, "learning_rate": 7.32e-06, "inf_nan_count": 0 }, { "step": 32025, "loss": 6.1999, "learning_rate": 7.28e-06, "inf_nan_count": 0 }, { "step": 32050, "loss": 6.1488, "learning_rate": 7.24e-06, "inf_nan_count": 0 }, { "step": 32075, "loss": 6.046, "learning_rate": 7.19e-06, "inf_nan_count": 0 } ], "evaluation_results": [ { "step": 32000, "paloma": 2.977755235898109e+26 } ], "config": { "d_model": 96, "n_layers": 12, "max_seq_len": 2048, "vocab_size": 50304, "lr": 5e-05, "max_steps": 20000, "batch_size": 1 } } ], "summary": { "total_runs": 6, "run_names": [ "pico-decoder-tiny-dolma29k-v2", "pico-decoder-tiny-dolma29k-v3", "pico-decoder-tiny-dolma29k-v1", "pico-decoder-tiny-dolma-teensy-v0", "pico-decoder-tiny-dolma-teensy-v1", "pico-decoder-tiny-dolma5M-v1" ] } }