diff --git "a/plots/data.json" "b/plots/data.json" deleted file mode 100644--- "a/plots/data.json" +++ /dev/null @@ -1,6367 +0,0 @@ -{ - "runs": [ - { - "run_name": "pico-decoder-tiny-dolma29k-v2", - "log_file": "log_20250829_003838.log", - "training_metrics": [ - { - "step": 0, - "loss": 10.9848, - "learning_rate": 0.0, - "inf_nan_count": 0 - }, - { - "step": 50, - "loss": 11.0005, - "learning_rate": 1e-06, - "inf_nan_count": 0 - }, - { - "step": 100, - "loss": 10.9918, - "learning_rate": 2e-06, - "inf_nan_count": 0 - }, - { - "step": 150, - "loss": 10.9776, - "learning_rate": 3e-06, - "inf_nan_count": 0 - }, - { - "step": 200, - "loss": 10.9569, - "learning_rate": 4e-06, - "inf_nan_count": 0 - }, - { - "step": 250, - "loss": 10.9255, - "learning_rate": 5e-06, - "inf_nan_count": 0 - }, - { - "step": 300, - "loss": 10.8883, - "learning_rate": 6e-06, - "inf_nan_count": 0 - }, - { - "step": 350, - "loss": 10.8249, - "learning_rate": 7e-06, - "inf_nan_count": 0 - }, - { - "step": 400, - "loss": 10.7344, - "learning_rate": 8e-06, - "inf_nan_count": 0 - }, - { - "step": 450, - "loss": 10.6177, - "learning_rate": 9e-06, - "inf_nan_count": 0 - }, - { - "step": 500, - "loss": 10.5025, - "learning_rate": 1e-05, - "inf_nan_count": 0 - }, - { - "step": 550, - "loss": 10.3986, - "learning_rate": 1.1e-05, - "inf_nan_count": 0 - }, - { - "step": 600, - "loss": 10.3079, - "learning_rate": 1.2e-05, - "inf_nan_count": 0 - }, - { - "step": 650, - "loss": 10.2142, - "learning_rate": 1.3e-05, - "inf_nan_count": 0 - }, - { - "step": 700, - "loss": 10.1146, - "learning_rate": 1.4e-05, - "inf_nan_count": 0 - }, - { - "step": 750, - "loss": 10.0398, - "learning_rate": 1.5e-05, - "inf_nan_count": 0 - }, - { - "step": 800, - "loss": 9.9311, - "learning_rate": 1.6e-05, - "inf_nan_count": 0 - }, - { - "step": 850, - "loss": 9.8431, - "learning_rate": 1.7e-05, - "inf_nan_count": 0 - }, - { - "step": 900, - "loss": 9.7453, - "learning_rate": 1.8e-05, - "inf_nan_count": 0 - }, - { - "step": 950, - "loss": 9.6527, - "learning_rate": 1.9e-05, - "inf_nan_count": 0 - }, - { - "step": 1000, - "loss": 9.5691, - "learning_rate": 2e-05, - "inf_nan_count": 0 - }, - { - "step": 1050, - "loss": 9.46, - "learning_rate": 2.1e-05, - "inf_nan_count": 0 - }, - { - "step": 1100, - "loss": 9.3525, - "learning_rate": 2.2e-05, - "inf_nan_count": 0 - }, - { - "step": 1150, - "loss": 9.2715, - "learning_rate": 2.3e-05, - "inf_nan_count": 0 - }, - { - "step": 1200, - "loss": 9.1618, - "learning_rate": 2.4e-05, - "inf_nan_count": 0 - }, - { - "step": 1250, - "loss": 9.0547, - "learning_rate": 2.5e-05, - "inf_nan_count": 0 - }, - { - "step": 1300, - "loss": 8.955, - "learning_rate": 2.6e-05, - "inf_nan_count": 0 - }, - { - "step": 1350, - "loss": 8.8251, - "learning_rate": 2.7e-05, - "inf_nan_count": 0 - }, - { - "step": 1400, - "loss": 8.7711, - "learning_rate": 2.8e-05, - "inf_nan_count": 0 - }, - { - "step": 1450, - "loss": 8.6834, - "learning_rate": 2.9e-05, - "inf_nan_count": 0 - }, - { - "step": 1500, - "loss": 8.5638, - "learning_rate": 3e-05, - "inf_nan_count": 0 - }, - { - "step": 1550, - "loss": 8.4572, - "learning_rate": 3.1e-05, - "inf_nan_count": 0 - }, - { - "step": 1600, - "loss": 8.394, - "learning_rate": 3.2e-05, - "inf_nan_count": 0 - }, - { - "step": 1650, - "loss": 8.2973, - "learning_rate": 3.3e-05, - "inf_nan_count": 0 - }, - { - "step": 1700, - "loss": 8.2264, - "learning_rate": 3.4e-05, - "inf_nan_count": 0 - }, - { - "step": 1750, - "loss": 8.1672, - "learning_rate": 3.5e-05, - "inf_nan_count": 0 - }, - { - "step": 1800, - "loss": 8.0695, - "learning_rate": 3.6e-05, - "inf_nan_count": 0 - }, - { - "step": 1850, - "loss": 8.0299, - "learning_rate": 3.7e-05, - "inf_nan_count": 0 - }, - { - "step": 1900, - "loss": 7.9883, - "learning_rate": 3.8e-05, - "inf_nan_count": 0 - }, - { - "step": 1950, - "loss": 7.9429, - "learning_rate": 3.9e-05, - "inf_nan_count": 0 - }, - { - "step": 2000, - "loss": 7.8447, - "learning_rate": 4e-05, - "inf_nan_count": 0 - }, - { - "step": 2050, - "loss": 7.838, - "learning_rate": 4.1e-05, - "inf_nan_count": 0 - }, - { - "step": 2100, - "loss": 7.7671, - "learning_rate": 4.2e-05, - "inf_nan_count": 0 - }, - { - "step": 2150, - "loss": 7.7637, - "learning_rate": 4.3e-05, - "inf_nan_count": 0 - }, - { - "step": 2200, - "loss": 7.706, - "learning_rate": 4.4e-05, - "inf_nan_count": 0 - }, - { - "step": 2250, - "loss": 7.7607, - "learning_rate": 4.5e-05, - "inf_nan_count": 0 - }, - { - "step": 2300, - "loss": 7.7076, - "learning_rate": 4.6e-05, - "inf_nan_count": 0 - }, - { - "step": 2350, - "loss": 7.6787, - "learning_rate": 4.7e-05, - "inf_nan_count": 0 - }, - { - "step": 2400, - "loss": 7.6446, - "learning_rate": 4.8e-05, - "inf_nan_count": 0 - }, - { - "step": 2450, - "loss": 7.5999, - "learning_rate": 4.9e-05, - "inf_nan_count": 0 - }, - { - "step": 2500, - "loss": 7.6154, - "learning_rate": 5e-05, - "inf_nan_count": 0 - }, - { - "step": 2550, - "loss": 7.5627, - "learning_rate": 5.1e-05, - "inf_nan_count": 0 - }, - { - "step": 2600, - "loss": 7.5747, - "learning_rate": 5.2e-05, - "inf_nan_count": 0 - }, - { - "step": 2650, - "loss": 7.5358, - "learning_rate": 5.3e-05, - "inf_nan_count": 0 - }, - { - "step": 2700, - "loss": 7.5148, - "learning_rate": 5.4e-05, - "inf_nan_count": 0 - }, - { - "step": 2750, - "loss": 7.4874, - "learning_rate": 5.5e-05, - "inf_nan_count": 0 - }, - { - "step": 2800, - "loss": 7.4438, - "learning_rate": 5.6e-05, - "inf_nan_count": 0 - }, - { - "step": 2850, - "loss": 7.4772, - "learning_rate": 5.7e-05, - "inf_nan_count": 0 - }, - { - "step": 2900, - "loss": 7.4135, - "learning_rate": 5.8e-05, - "inf_nan_count": 0 - }, - { - "step": 2950, - "loss": 7.3929, - "learning_rate": 5.9e-05, - "inf_nan_count": 0 - }, - { - "step": 3000, - "loss": 7.3566, - "learning_rate": 6e-05, - "inf_nan_count": 0 - }, - { - "step": 3050, - "loss": 7.3318, - "learning_rate": 6.1e-05, - "inf_nan_count": 0 - }, - { - "step": 3100, - "loss": 7.3114, - "learning_rate": 6.2e-05, - "inf_nan_count": 0 - }, - { - "step": 3150, - "loss": 7.2734, - "learning_rate": 6.3e-05, - "inf_nan_count": 0 - }, - { - "step": 3200, - "loss": 7.322, - "learning_rate": 6.4e-05, - "inf_nan_count": 0 - }, - { - "step": 3250, - "loss": 7.2621, - "learning_rate": 6.5e-05, - "inf_nan_count": 0 - }, - { - "step": 3300, - "loss": 7.2257, - "learning_rate": 6.6e-05, - "inf_nan_count": 0 - }, - { - "step": 3350, - "loss": 7.2447, - "learning_rate": 6.7e-05, - "inf_nan_count": 0 - }, - { - "step": 3400, - "loss": 7.2344, - "learning_rate": 6.8e-05, - "inf_nan_count": 0 - }, - { - "step": 3450, - "loss": 7.1488, - "learning_rate": 6.9e-05, - "inf_nan_count": 0 - }, - { - "step": 3500, - "loss": 7.1797, - "learning_rate": 7e-05, - "inf_nan_count": 0 - }, - { - "step": 3550, - "loss": 7.1737, - "learning_rate": 7.1e-05, - "inf_nan_count": 0 - }, - { - "step": 3600, - "loss": 7.1204, - "learning_rate": 7.2e-05, - "inf_nan_count": 0 - }, - { - "step": 3650, - "loss": 7.1102, - "learning_rate": 7.3e-05, - "inf_nan_count": 0 - }, - { - "step": 3700, - "loss": 7.0845, - "learning_rate": 7.4e-05, - "inf_nan_count": 0 - }, - { - "step": 3750, - "loss": 7.0858, - "learning_rate": 7.5e-05, - "inf_nan_count": 0 - }, - { - "step": 3800, - "loss": 7.0362, - "learning_rate": 7.6e-05, - "inf_nan_count": 0 - }, - { - "step": 3850, - "loss": 7.0603, - "learning_rate": 7.7e-05, - "inf_nan_count": 0 - }, - { - "step": 3900, - "loss": 7.0172, - "learning_rate": 7.8e-05, - "inf_nan_count": 0 - }, - { - "step": 3950, - "loss": 6.9948, - "learning_rate": 7.9e-05, - "inf_nan_count": 0 - }, - { - "step": 4000, - "loss": 6.9909, - "learning_rate": 8e-05, - "inf_nan_count": 0 - }, - { - "step": 4050, - "loss": 6.9477, - "learning_rate": 8.1e-05, - "inf_nan_count": 0 - }, - { - "step": 4100, - "loss": 6.9651, - "learning_rate": 8.2e-05, - "inf_nan_count": 0 - }, - { - "step": 4150, - "loss": 6.9149, - "learning_rate": 8.3e-05, - "inf_nan_count": 0 - }, - { - "step": 4200, - "loss": 6.893, - "learning_rate": 8.4e-05, - "inf_nan_count": 0 - }, - { - "step": 4250, - "loss": 6.9227, - "learning_rate": 8.5e-05, - "inf_nan_count": 0 - }, - { - "step": 4300, - "loss": 6.879, - "learning_rate": 8.6e-05, - "inf_nan_count": 0 - }, - { - "step": 4350, - "loss": 6.8649, - "learning_rate": 8.7e-05, - "inf_nan_count": 0 - }, - { - "step": 4400, - "loss": 6.8305, - "learning_rate": 8.8e-05, - "inf_nan_count": 0 - }, - { - "step": 4450, - "loss": 6.8085, - "learning_rate": 8.9e-05, - "inf_nan_count": 0 - }, - { - "step": 4500, - "loss": 6.8315, - "learning_rate": 9e-05, - "inf_nan_count": 0 - }, - { - "step": 4550, - "loss": 6.7885, - "learning_rate": 9.1e-05, - "inf_nan_count": 0 - }, - { - "step": 4600, - "loss": 6.7805, - "learning_rate": 9.2e-05, - "inf_nan_count": 0 - }, - { - "step": 4650, - "loss": 6.7737, - "learning_rate": 9.3e-05, - "inf_nan_count": 0 - }, - { - "step": 4700, - "loss": 6.7649, - "learning_rate": 9.4e-05, - "inf_nan_count": 0 - }, - { - "step": 4750, - "loss": 6.7562, - "learning_rate": 9.5e-05, - "inf_nan_count": 0 - }, - { - "step": 4800, - "loss": 6.7347, - "learning_rate": 9.6e-05, - "inf_nan_count": 0 - }, - { - "step": 4850, - "loss": 6.7161, - "learning_rate": 9.7e-05, - "inf_nan_count": 0 - }, - { - "step": 4900, - "loss": 6.6889, - "learning_rate": 9.8e-05, - "inf_nan_count": 0 - }, - { - "step": 4950, - "loss": 6.7299, - "learning_rate": 9.9e-05, - "inf_nan_count": 0 - }, - { - "step": 5000, - "loss": 6.6605, - "learning_rate": 0.0001, - "inf_nan_count": 0 - }, - { - "step": 5050, - "loss": 6.6552, - "learning_rate": 0.0001, - "inf_nan_count": 0 - }, - { - "step": 5100, - "loss": 6.7038, - "learning_rate": 9.99e-05, - "inf_nan_count": 0 - }, - { - "step": 5150, - "loss": 6.6452, - "learning_rate": 9.99e-05, - "inf_nan_count": 0 - }, - { - "step": 5200, - "loss": 6.6522, - "learning_rate": 9.99e-05, - "inf_nan_count": 0 - }, - { - "step": 5250, - "loss": 6.627, - "learning_rate": 9.99e-05, - "inf_nan_count": 0 - }, - { - "step": 5300, - "loss": 6.5733, - "learning_rate": 9.98e-05, - "inf_nan_count": 0 - }, - { - "step": 5350, - "loss": 6.5833, - "learning_rate": 9.98e-05, - "inf_nan_count": 0 - }, - { - "step": 5400, - "loss": 6.5854, - "learning_rate": 9.98e-05, - "inf_nan_count": 0 - }, - { - "step": 5450, - "loss": 6.6012, - "learning_rate": 9.98e-05, - "inf_nan_count": 0 - }, - { - "step": 5500, - "loss": 6.5786, - "learning_rate": 9.97e-05, - "inf_nan_count": 0 - } - ], - "evaluation_results": [ - { - "step": 1000, - "paloma": 5.073320568651489e+18 - }, - { - "step": 2000, - "paloma": 1.8978577072995303e+19 - }, - { - "step": 3000, - "paloma": 3.1701596694317715e+19 - }, - { - "step": 4000, - "paloma": 2.5015965971757485e+20 - }, - { - "step": 5000, - "paloma": 2.38712860824014e+21 - } - ], - "config": { - "d_model": 96, - "n_layers": 12, - "max_seq_len": 2048, - "vocab_size": 50304, - "lr": 0.0001, - "max_steps": 200000, - "batch_size": 1 - } - }, - { - "run_name": "pico-decoder-tiny-dolma29k-v3", - "log_file": "log_20250829_020629.log", - "training_metrics": [ - { - "step": 500, - "loss": 10.8854, - "learning_rate": 3.13e-06, - "inf_nan_count": 0 - }, - { - "step": 525, - "loss": 10.889, - "learning_rate": 3.28e-06, - "inf_nan_count": 0 - }, - { - "step": 550, - "loss": 10.8846, - "learning_rate": 3.44e-06, - "inf_nan_count": 0 - }, - { - "step": 575, - "loss": 10.8657, - "learning_rate": 3.59e-06, - "inf_nan_count": 0 - }, - { - "step": 600, - "loss": 10.859, - "learning_rate": 3.75e-06, - "inf_nan_count": 0 - }, - { - "step": 625, - "loss": 10.8328, - "learning_rate": 3.91e-06, - "inf_nan_count": 0 - }, - { - "step": 650, - "loss": 10.8166, - "learning_rate": 4.06e-06, - "inf_nan_count": 0 - }, - { - "step": 675, - "loss": 10.7913, - "learning_rate": 4.22e-06, - "inf_nan_count": 0 - }, - { - "step": 700, - "loss": 10.7609, - "learning_rate": 4.37e-06, - "inf_nan_count": 0 - }, - { - "step": 725, - "loss": 10.7322, - "learning_rate": 4.53e-06, - "inf_nan_count": 0 - }, - { - "step": 750, - "loss": 10.7121, - "learning_rate": 4.69e-06, - "inf_nan_count": 0 - }, - { - "step": 775, - "loss": 10.6877, - "learning_rate": 4.84e-06, - "inf_nan_count": 0 - }, - { - "step": 800, - "loss": 10.6436, - "learning_rate": 5e-06, - "inf_nan_count": 0 - }, - { - "step": 825, - "loss": 10.6256, - "learning_rate": 5.16e-06, - "inf_nan_count": 0 - }, - { - "step": 850, - "loss": 10.5961, - "learning_rate": 5.31e-06, - "inf_nan_count": 0 - }, - { - "step": 875, - "loss": 10.5443, - "learning_rate": 5.47e-06, - "inf_nan_count": 0 - }, - { - "step": 900, - "loss": 10.5197, - "learning_rate": 5.63e-06, - "inf_nan_count": 0 - }, - { - "step": 925, - "loss": 10.4854, - "learning_rate": 5.78e-06, - "inf_nan_count": 0 - }, - { - "step": 950, - "loss": 10.4826, - "learning_rate": 5.94e-06, - "inf_nan_count": 0 - }, - { - "step": 975, - "loss": 10.4557, - "learning_rate": 6.09e-06, - "inf_nan_count": 0 - }, - { - "step": 1000, - "loss": 10.4142, - "learning_rate": 6.25e-06, - "inf_nan_count": 0 - }, - { - "step": 1025, - "loss": 10.3885, - "learning_rate": 6.41e-06, - "inf_nan_count": 0 - }, - { - "step": 1050, - "loss": 10.3737, - "learning_rate": 6.56e-06, - "inf_nan_count": 0 - }, - { - "step": 1075, - "loss": 10.3534, - "learning_rate": 6.72e-06, - "inf_nan_count": 0 - }, - { - "step": 1100, - "loss": 10.3219, - "learning_rate": 6.88e-06, - "inf_nan_count": 0 - }, - { - "step": 1125, - "loss": 10.3064, - "learning_rate": 7.03e-06, - "inf_nan_count": 0 - }, - { - "step": 1150, - "loss": 10.2761, - "learning_rate": 7.19e-06, - "inf_nan_count": 0 - }, - { - "step": 1175, - "loss": 10.2592, - "learning_rate": 7.34e-06, - "inf_nan_count": 0 - }, - { - "step": 1200, - "loss": 10.242, - "learning_rate": 7.5e-06, - "inf_nan_count": 0 - }, - { - "step": 1225, - "loss": 10.2141, - "learning_rate": 7.66e-06, - "inf_nan_count": 0 - }, - { - "step": 1250, - "loss": 10.1882, - "learning_rate": 7.81e-06, - "inf_nan_count": 0 - }, - { - "step": 1275, - "loss": 10.1608, - "learning_rate": 7.97e-06, - "inf_nan_count": 0 - }, - { - "step": 1300, - "loss": 10.146, - "learning_rate": 8.13e-06, - "inf_nan_count": 0 - }, - { - "step": 1325, - "loss": 10.0944, - "learning_rate": 8.28e-06, - "inf_nan_count": 0 - }, - { - "step": 1350, - "loss": 10.0885, - "learning_rate": 8.44e-06, - "inf_nan_count": 0 - }, - { - "step": 1375, - "loss": 10.0748, - "learning_rate": 8.59e-06, - "inf_nan_count": 0 - }, - { - "step": 1400, - "loss": 10.0425, - "learning_rate": 8.75e-06, - "inf_nan_count": 0 - }, - { - "step": 1425, - "loss": 10.0422, - "learning_rate": 8.91e-06, - "inf_nan_count": 0 - }, - { - "step": 1450, - "loss": 10.0039, - "learning_rate": 9.06e-06, - "inf_nan_count": 0 - }, - { - "step": 1475, - "loss": 9.9736, - "learning_rate": 9.22e-06, - "inf_nan_count": 0 - }, - { - "step": 1500, - "loss": 9.9729, - "learning_rate": 9.38e-06, - "inf_nan_count": 0 - }, - { - "step": 1525, - "loss": 9.9379, - "learning_rate": 9.53e-06, - "inf_nan_count": 0 - }, - { - "step": 1550, - "loss": 9.8819, - "learning_rate": 9.69e-06, - "inf_nan_count": 0 - }, - { - "step": 1575, - "loss": 9.8702, - "learning_rate": 9.84e-06, - "inf_nan_count": 0 - }, - { - "step": 1600, - "loss": 9.8571, - "learning_rate": 1e-05, - "inf_nan_count": 0 - }, - { - "step": 1625, - "loss": 9.8356, - "learning_rate": 1.02e-05, - "inf_nan_count": 0 - }, - { - "step": 1650, - "loss": 9.7973, - "learning_rate": 1.03e-05, - "inf_nan_count": 0 - }, - { - "step": 1675, - "loss": 9.7745, - "learning_rate": 1.05e-05, - "inf_nan_count": 0 - }, - { - "step": 1700, - "loss": 9.7673, - "learning_rate": 1.06e-05, - "inf_nan_count": 0 - }, - { - "step": 1725, - "loss": 9.7406, - "learning_rate": 1.08e-05, - "inf_nan_count": 0 - }, - { - "step": 1750, - "loss": 9.7312, - "learning_rate": 1.09e-05, - "inf_nan_count": 0 - }, - { - "step": 1775, - "loss": 9.6563, - "learning_rate": 1.11e-05, - "inf_nan_count": 0 - }, - { - "step": 1800, - "loss": 9.6515, - "learning_rate": 1.13e-05, - "inf_nan_count": 0 - }, - { - "step": 1825, - "loss": 9.6241, - "learning_rate": 1.14e-05, - "inf_nan_count": 0 - }, - { - "step": 1850, - "loss": 9.6015, - "learning_rate": 1.16e-05, - "inf_nan_count": 0 - }, - { - "step": 1875, - "loss": 9.5933, - "learning_rate": 1.17e-05, - "inf_nan_count": 0 - }, - { - "step": 1900, - "loss": 9.5544, - "learning_rate": 1.19e-05, - "inf_nan_count": 0 - }, - { - "step": 1925, - "loss": 9.5407, - "learning_rate": 1.2e-05, - "inf_nan_count": 0 - }, - { - "step": 1950, - "loss": 9.5431, - "learning_rate": 1.22e-05, - "inf_nan_count": 0 - }, - { - "step": 1975, - "loss": 9.4853, - "learning_rate": 1.23e-05, - "inf_nan_count": 0 - }, - { - "step": 2000, - "loss": 9.4665, - "learning_rate": 1.25e-05, - "inf_nan_count": 0 - }, - { - "step": 2025, - "loss": 9.4621, - "learning_rate": 1.27e-05, - "inf_nan_count": 0 - }, - { - "step": 2050, - "loss": 9.4031, - "learning_rate": 1.28e-05, - "inf_nan_count": 0 - }, - { - "step": 2075, - "loss": 9.3699, - "learning_rate": 1.3e-05, - "inf_nan_count": 0 - }, - { - "step": 2100, - "loss": 9.3422, - "learning_rate": 1.31e-05, - "inf_nan_count": 0 - }, - { - "step": 2125, - "loss": 9.3129, - "learning_rate": 1.33e-05, - "inf_nan_count": 0 - }, - { - "step": 2150, - "loss": 9.2917, - "learning_rate": 1.34e-05, - "inf_nan_count": 0 - }, - { - "step": 2175, - "loss": 9.267, - "learning_rate": 1.36e-05, - "inf_nan_count": 0 - }, - { - "step": 2200, - "loss": 9.2512, - "learning_rate": 1.38e-05, - "inf_nan_count": 0 - }, - { - "step": 2225, - "loss": 9.2737, - "learning_rate": 1.39e-05, - "inf_nan_count": 0 - }, - { - "step": 2250, - "loss": 9.2357, - "learning_rate": 1.41e-05, - "inf_nan_count": 0 - }, - { - "step": 2275, - "loss": 9.1471, - "learning_rate": 1.42e-05, - "inf_nan_count": 0 - }, - { - "step": 2300, - "loss": 9.1305, - "learning_rate": 1.44e-05, - "inf_nan_count": 0 - }, - { - "step": 2325, - "loss": 9.143, - "learning_rate": 1.45e-05, - "inf_nan_count": 0 - }, - { - "step": 2350, - "loss": 9.0948, - "learning_rate": 1.47e-05, - "inf_nan_count": 0 - }, - { - "step": 2375, - "loss": 9.0256, - "learning_rate": 1.48e-05, - "inf_nan_count": 0 - }, - { - "step": 2400, - "loss": 9.0664, - "learning_rate": 1.5e-05, - "inf_nan_count": 0 - }, - { - "step": 2425, - "loss": 9.002, - "learning_rate": 1.52e-05, - "inf_nan_count": 0 - }, - { - "step": 2450, - "loss": 8.9518, - "learning_rate": 1.53e-05, - "inf_nan_count": 0 - }, - { - "step": 2475, - "loss": 8.9717, - "learning_rate": 1.55e-05, - "inf_nan_count": 0 - }, - { - "step": 2500, - "loss": 8.9536, - "learning_rate": 1.56e-05, - "inf_nan_count": 0 - }, - { - "step": 2525, - "loss": 8.8812, - "learning_rate": 1.58e-05, - "inf_nan_count": 0 - }, - { - "step": 2550, - "loss": 8.8824, - "learning_rate": 1.59e-05, - "inf_nan_count": 0 - }, - { - "step": 2575, - "loss": 8.8564, - "learning_rate": 1.61e-05, - "inf_nan_count": 0 - }, - { - "step": 2600, - "loss": 8.8419, - "learning_rate": 1.63e-05, - "inf_nan_count": 0 - }, - { - "step": 2625, - "loss": 8.7865, - "learning_rate": 1.64e-05, - "inf_nan_count": 0 - }, - { - "step": 2650, - "loss": 8.7493, - "learning_rate": 1.66e-05, - "inf_nan_count": 0 - }, - { - "step": 2675, - "loss": 8.7255, - "learning_rate": 1.67e-05, - "inf_nan_count": 0 - }, - { - "step": 2700, - "loss": 8.6469, - "learning_rate": 1.69e-05, - "inf_nan_count": 0 - }, - { - "step": 2725, - "loss": 8.6799, - "learning_rate": 1.7e-05, - "inf_nan_count": 0 - }, - { - "step": 2750, - "loss": 8.6974, - "learning_rate": 1.72e-05, - "inf_nan_count": 0 - }, - { - "step": 2775, - "loss": 8.6441, - "learning_rate": 1.73e-05, - "inf_nan_count": 0 - }, - { - "step": 2800, - "loss": 8.6689, - "learning_rate": 1.75e-05, - "inf_nan_count": 0 - }, - { - "step": 2825, - "loss": 8.5732, - "learning_rate": 1.77e-05, - "inf_nan_count": 0 - }, - { - "step": 2850, - "loss": 8.5955, - "learning_rate": 1.78e-05, - "inf_nan_count": 0 - }, - { - "step": 2875, - "loss": 8.5823, - "learning_rate": 1.8e-05, - "inf_nan_count": 0 - }, - { - "step": 2900, - "loss": 8.5968, - "learning_rate": 1.81e-05, - "inf_nan_count": 0 - }, - { - "step": 2925, - "loss": 8.4721, - "learning_rate": 1.83e-05, - "inf_nan_count": 0 - }, - { - "step": 2950, - "loss": 8.4672, - "learning_rate": 1.84e-05, - "inf_nan_count": 0 - }, - { - "step": 2975, - "loss": 8.4033, - "learning_rate": 1.86e-05, - "inf_nan_count": 0 - }, - { - "step": 3000, - "loss": 8.4947, - "learning_rate": 1.88e-05, - "inf_nan_count": 0 - }, - { - "step": 3025, - "loss": 8.378, - "learning_rate": 1.89e-05, - "inf_nan_count": 0 - }, - { - "step": 3050, - "loss": 8.3581, - "learning_rate": 1.91e-05, - "inf_nan_count": 0 - }, - { - "step": 3075, - "loss": 8.3341, - "learning_rate": 1.92e-05, - "inf_nan_count": 0 - }, - { - "step": 3100, - "loss": 8.3391, - "learning_rate": 1.94e-05, - "inf_nan_count": 0 - }, - { - "step": 3125, - "loss": 8.367, - "learning_rate": 1.95e-05, - "inf_nan_count": 0 - }, - { - "step": 3150, - "loss": 8.237, - "learning_rate": 1.97e-05, - "inf_nan_count": 0 - }, - { - "step": 3175, - "loss": 8.2879, - "learning_rate": 1.98e-05, - "inf_nan_count": 0 - }, - { - "step": 3200, - "loss": 8.2706, - "learning_rate": 2e-05, - "inf_nan_count": 0 - }, - { - "step": 3225, - "loss": 8.1983, - "learning_rate": 2.02e-05, - "inf_nan_count": 0 - }, - { - "step": 3250, - "loss": 8.2174, - "learning_rate": 2.03e-05, - "inf_nan_count": 0 - }, - { - "step": 3275, - "loss": 8.2229, - "learning_rate": 2.05e-05, - "inf_nan_count": 0 - }, - { - "step": 3300, - "loss": 8.1398, - "learning_rate": 2.06e-05, - "inf_nan_count": 0 - }, - { - "step": 3325, - "loss": 8.143, - "learning_rate": 2.08e-05, - "inf_nan_count": 0 - }, - { - "step": 3350, - "loss": 8.1471, - "learning_rate": 2.09e-05, - "inf_nan_count": 0 - }, - { - "step": 3375, - "loss": 8.0908, - "learning_rate": 2.11e-05, - "inf_nan_count": 0 - }, - { - "step": 3400, - "loss": 8.1165, - "learning_rate": 2.13e-05, - "inf_nan_count": 0 - }, - { - "step": 3425, - "loss": 8.0957, - "learning_rate": 2.14e-05, - "inf_nan_count": 0 - }, - { - "step": 3450, - "loss": 8.1115, - "learning_rate": 2.16e-05, - "inf_nan_count": 0 - }, - { - "step": 3475, - "loss": 8.0623, - "learning_rate": 2.17e-05, - "inf_nan_count": 0 - }, - { - "step": 3500, - "loss": 8.0527, - "learning_rate": 2.19e-05, - "inf_nan_count": 0 - }, - { - "step": 3525, - "loss": 7.9975, - "learning_rate": 2.2e-05, - "inf_nan_count": 0 - }, - { - "step": 3550, - "loss": 7.9881, - "learning_rate": 2.22e-05, - "inf_nan_count": 0 - }, - { - "step": 3575, - "loss": 8.006, - "learning_rate": 2.23e-05, - "inf_nan_count": 0 - }, - { - "step": 3600, - "loss": 7.9366, - "learning_rate": 2.25e-05, - "inf_nan_count": 0 - }, - { - "step": 3625, - "loss": 8.0252, - "learning_rate": 2.27e-05, - "inf_nan_count": 0 - }, - { - "step": 3650, - "loss": 7.916, - "learning_rate": 2.28e-05, - "inf_nan_count": 0 - }, - { - "step": 3675, - "loss": 7.947, - "learning_rate": 2.3e-05, - "inf_nan_count": 0 - }, - { - "step": 3700, - "loss": 7.8943, - "learning_rate": 2.31e-05, - "inf_nan_count": 0 - }, - { - "step": 3725, - "loss": 7.8951, - "learning_rate": 2.33e-05, - "inf_nan_count": 0 - }, - { - "step": 3750, - "loss": 7.9316, - "learning_rate": 2.34e-05, - "inf_nan_count": 0 - }, - { - "step": 3775, - "loss": 7.9407, - "learning_rate": 2.36e-05, - "inf_nan_count": 0 - }, - { - "step": 3800, - "loss": 7.9385, - "learning_rate": 2.38e-05, - "inf_nan_count": 0 - }, - { - "step": 3825, - "loss": 7.88, - "learning_rate": 2.39e-05, - "inf_nan_count": 0 - }, - { - "step": 3850, - "loss": 7.9207, - "learning_rate": 2.41e-05, - "inf_nan_count": 0 - }, - { - "step": 3875, - "loss": 7.8258, - "learning_rate": 2.42e-05, - "inf_nan_count": 0 - }, - { - "step": 3900, - "loss": 7.9005, - "learning_rate": 2.44e-05, - "inf_nan_count": 0 - }, - { - "step": 3925, - "loss": 7.8232, - "learning_rate": 2.45e-05, - "inf_nan_count": 0 - }, - { - "step": 3950, - "loss": 7.7847, - "learning_rate": 2.47e-05, - "inf_nan_count": 0 - }, - { - "step": 3975, - "loss": 7.7909, - "learning_rate": 2.48e-05, - "inf_nan_count": 0 - }, - { - "step": 4000, - "loss": 7.7419, - "learning_rate": 2.5e-05, - "inf_nan_count": 0 - }, - { - "step": 4025, - "loss": 7.8031, - "learning_rate": 2.52e-05, - "inf_nan_count": 0 - }, - { - "step": 4050, - "loss": 7.7948, - "learning_rate": 2.53e-05, - "inf_nan_count": 0 - }, - { - "step": 4075, - "loss": 7.7259, - "learning_rate": 2.55e-05, - "inf_nan_count": 0 - }, - { - "step": 4100, - "loss": 7.8406, - "learning_rate": 2.56e-05, - "inf_nan_count": 0 - }, - { - "step": 4125, - "loss": 7.7938, - "learning_rate": 2.58e-05, - "inf_nan_count": 0 - }, - { - "step": 4150, - "loss": 7.7101, - "learning_rate": 2.59e-05, - "inf_nan_count": 0 - }, - { - "step": 4175, - "loss": 7.6633, - "learning_rate": 2.61e-05, - "inf_nan_count": 0 - }, - { - "step": 4200, - "loss": 7.683, - "learning_rate": 2.63e-05, - "inf_nan_count": 0 - }, - { - "step": 4225, - "loss": 7.7106, - "learning_rate": 2.64e-05, - "inf_nan_count": 0 - }, - { - "step": 4250, - "loss": 7.7174, - "learning_rate": 2.66e-05, - "inf_nan_count": 0 - }, - { - "step": 4275, - "loss": 7.7508, - "learning_rate": 2.67e-05, - "inf_nan_count": 0 - }, - { - "step": 4300, - "loss": 7.6831, - "learning_rate": 2.69e-05, - "inf_nan_count": 0 - }, - { - "step": 4325, - "loss": 7.6498, - "learning_rate": 2.7e-05, - "inf_nan_count": 0 - }, - { - "step": 4350, - "loss": 7.6668, - "learning_rate": 2.72e-05, - "inf_nan_count": 0 - }, - { - "step": 4375, - "loss": 7.6852, - "learning_rate": 2.73e-05, - "inf_nan_count": 0 - }, - { - "step": 4400, - "loss": 7.6469, - "learning_rate": 2.75e-05, - "inf_nan_count": 0 - }, - { - "step": 4425, - "loss": 7.7448, - "learning_rate": 2.77e-05, - "inf_nan_count": 0 - }, - { - "step": 4450, - "loss": 7.7422, - "learning_rate": 2.78e-05, - "inf_nan_count": 0 - }, - { - "step": 4475, - "loss": 7.6918, - "learning_rate": 2.8e-05, - "inf_nan_count": 0 - }, - { - "step": 4500, - "loss": 7.7084, - "learning_rate": 2.81e-05, - "inf_nan_count": 0 - }, - { - "step": 4525, - "loss": 7.722, - "learning_rate": 2.83e-05, - "inf_nan_count": 0 - }, - { - "step": 4550, - "loss": 7.6893, - "learning_rate": 2.84e-05, - "inf_nan_count": 0 - }, - { - "step": 4575, - "loss": 7.6454, - "learning_rate": 2.86e-05, - "inf_nan_count": 0 - }, - { - "step": 4600, - "loss": 7.6298, - "learning_rate": 2.87e-05, - "inf_nan_count": 0 - }, - { - "step": 4625, - "loss": 7.642, - "learning_rate": 2.89e-05, - "inf_nan_count": 0 - }, - { - "step": 4650, - "loss": 7.6247, - "learning_rate": 2.91e-05, - "inf_nan_count": 0 - }, - { - "step": 4675, - "loss": 7.6448, - "learning_rate": 2.92e-05, - "inf_nan_count": 0 - }, - { - "step": 4700, - "loss": 7.6506, - "learning_rate": 2.94e-05, - "inf_nan_count": 0 - }, - { - "step": 4725, - "loss": 7.6356, - "learning_rate": 2.95e-05, - "inf_nan_count": 0 - }, - { - "step": 4750, - "loss": 7.6426, - "learning_rate": 2.97e-05, - "inf_nan_count": 0 - }, - { - "step": 4775, - "loss": 7.6388, - "learning_rate": 2.98e-05, - "inf_nan_count": 0 - }, - { - "step": 4800, - "loss": 7.5216, - "learning_rate": 3e-05, - "inf_nan_count": 0 - }, - { - "step": 4825, - "loss": 7.5367, - "learning_rate": 3.02e-05, - "inf_nan_count": 0 - }, - { - "step": 4850, - "loss": 7.5084, - "learning_rate": 3.03e-05, - "inf_nan_count": 0 - }, - { - "step": 4875, - "loss": 7.6092, - "learning_rate": 3.05e-05, - "inf_nan_count": 0 - }, - { - "step": 4900, - "loss": 7.576, - "learning_rate": 3.06e-05, - "inf_nan_count": 0 - }, - { - "step": 4925, - "loss": 7.5686, - "learning_rate": 3.08e-05, - "inf_nan_count": 0 - }, - { - "step": 4950, - "loss": 7.5583, - "learning_rate": 3.09e-05, - "inf_nan_count": 0 - }, - { - "step": 4975, - "loss": 7.5818, - "learning_rate": 3.11e-05, - "inf_nan_count": 0 - }, - { - "step": 5000, - "loss": 7.6004, - "learning_rate": 3.13e-05, - "inf_nan_count": 0 - }, - { - "step": 5025, - "loss": 7.5371, - "learning_rate": 3.14e-05, - "inf_nan_count": 0 - }, - { - "step": 5050, - "loss": 7.5179, - "learning_rate": 3.16e-05, - "inf_nan_count": 0 - }, - { - "step": 5075, - "loss": 7.5255, - "learning_rate": 3.17e-05, - "inf_nan_count": 0 - }, - { - "step": 5100, - "loss": 7.5155, - "learning_rate": 3.19e-05, - "inf_nan_count": 0 - }, - { - "step": 5125, - "loss": 7.566, - "learning_rate": 3.2e-05, - "inf_nan_count": 0 - }, - { - "step": 5150, - "loss": 7.4797, - "learning_rate": 3.22e-05, - "inf_nan_count": 0 - }, - { - "step": 5175, - "loss": 7.6224, - "learning_rate": 3.23e-05, - "inf_nan_count": 0 - }, - { - "step": 5200, - "loss": 7.4821, - "learning_rate": 3.25e-05, - "inf_nan_count": 0 - }, - { - "step": 5225, - "loss": 7.4765, - "learning_rate": 3.27e-05, - "inf_nan_count": 0 - }, - { - "step": 5250, - "loss": 7.468, - "learning_rate": 3.28e-05, - "inf_nan_count": 0 - }, - { - "step": 5275, - "loss": 7.5165, - "learning_rate": 3.3e-05, - "inf_nan_count": 0 - }, - { - "step": 5300, - "loss": 7.5334, - "learning_rate": 3.31e-05, - "inf_nan_count": 0 - }, - { - "step": 5325, - "loss": 7.5053, - "learning_rate": 3.33e-05, - "inf_nan_count": 0 - }, - { - "step": 5350, - "loss": 7.5115, - "learning_rate": 3.34e-05, - "inf_nan_count": 0 - }, - { - "step": 5375, - "loss": 7.4736, - "learning_rate": 3.36e-05, - "inf_nan_count": 0 - }, - { - "step": 5400, - "loss": 7.452, - "learning_rate": 3.38e-05, - "inf_nan_count": 0 - }, - { - "step": 5425, - "loss": 7.4596, - "learning_rate": 3.39e-05, - "inf_nan_count": 0 - }, - { - "step": 5450, - "loss": 7.4518, - "learning_rate": 3.41e-05, - "inf_nan_count": 0 - }, - { - "step": 5475, - "loss": 7.4308, - "learning_rate": 3.42e-05, - "inf_nan_count": 0 - }, - { - "step": 5500, - "loss": 7.4627, - "learning_rate": 3.44e-05, - "inf_nan_count": 0 - }, - { - "step": 5525, - "loss": 7.4095, - "learning_rate": 3.45e-05, - "inf_nan_count": 0 - }, - { - "step": 5550, - "loss": 7.4423, - "learning_rate": 3.47e-05, - "inf_nan_count": 0 - }, - { - "step": 5575, - "loss": 7.46, - "learning_rate": 3.48e-05, - "inf_nan_count": 0 - }, - { - "step": 5600, - "loss": 7.3457, - "learning_rate": 3.5e-05, - "inf_nan_count": 0 - }, - { - "step": 5625, - "loss": 7.4838, - "learning_rate": 3.52e-05, - "inf_nan_count": 0 - }, - { - "step": 5650, - "loss": 7.4556, - "learning_rate": 3.53e-05, - "inf_nan_count": 0 - }, - { - "step": 5675, - "loss": 7.422, - "learning_rate": 3.55e-05, - "inf_nan_count": 0 - }, - { - "step": 5700, - "loss": 7.4307, - "learning_rate": 3.56e-05, - "inf_nan_count": 0 - }, - { - "step": 5725, - "loss": 7.3795, - "learning_rate": 3.58e-05, - "inf_nan_count": 0 - }, - { - "step": 5750, - "loss": 7.3855, - "learning_rate": 3.59e-05, - "inf_nan_count": 0 - }, - { - "step": 5775, - "loss": 7.3518, - "learning_rate": 3.61e-05, - "inf_nan_count": 0 - }, - { - "step": 5800, - "loss": 7.3794, - "learning_rate": 3.63e-05, - "inf_nan_count": 0 - }, - { - "step": 5825, - "loss": 7.3591, - "learning_rate": 3.64e-05, - "inf_nan_count": 0 - }, - { - "step": 5850, - "loss": 7.3489, - "learning_rate": 3.66e-05, - "inf_nan_count": 0 - }, - { - "step": 5875, - "loss": 7.4108, - "learning_rate": 3.67e-05, - "inf_nan_count": 0 - }, - { - "step": 5900, - "loss": 7.358, - "learning_rate": 3.69e-05, - "inf_nan_count": 0 - }, - { - "step": 5925, - "loss": 7.3131, - "learning_rate": 3.7e-05, - "inf_nan_count": 0 - }, - { - "step": 5950, - "loss": 7.2905, - "learning_rate": 3.72e-05, - "inf_nan_count": 0 - }, - { - "step": 5975, - "loss": 7.3466, - "learning_rate": 3.73e-05, - "inf_nan_count": 0 - }, - { - "step": 6000, - "loss": 7.3765, - "learning_rate": 3.75e-05, - "inf_nan_count": 0 - }, - { - "step": 6025, - "loss": 7.287, - "learning_rate": 3.77e-05, - "inf_nan_count": 0 - }, - { - "step": 6050, - "loss": 7.3333, - "learning_rate": 3.78e-05, - "inf_nan_count": 0 - }, - { - "step": 6075, - "loss": 7.3098, - "learning_rate": 3.8e-05, - "inf_nan_count": 0 - }, - { - "step": 6100, - "loss": 7.2594, - "learning_rate": 3.81e-05, - "inf_nan_count": 0 - }, - { - "step": 6125, - "loss": 7.3327, - "learning_rate": 3.83e-05, - "inf_nan_count": 0 - }, - { - "step": 6150, - "loss": 7.303, - "learning_rate": 3.84e-05, - "inf_nan_count": 0 - }, - { - "step": 6175, - "loss": 7.2523, - "learning_rate": 3.86e-05, - "inf_nan_count": 0 - }, - { - "step": 6200, - "loss": 7.2546, - "learning_rate": 3.87e-05, - "inf_nan_count": 0 - }, - { - "step": 6225, - "loss": 7.3242, - "learning_rate": 3.89e-05, - "inf_nan_count": 0 - }, - { - "step": 6250, - "loss": 7.2035, - "learning_rate": 3.91e-05, - "inf_nan_count": 0 - }, - { - "step": 6275, - "loss": 7.2334, - "learning_rate": 3.92e-05, - "inf_nan_count": 0 - }, - { - "step": 6300, - "loss": 7.2295, - "learning_rate": 3.94e-05, - "inf_nan_count": 0 - }, - { - "step": 6325, - "loss": 7.3051, - "learning_rate": 3.95e-05, - "inf_nan_count": 0 - }, - { - "step": 6350, - "loss": 7.3188, - "learning_rate": 3.97e-05, - "inf_nan_count": 0 - }, - { - "step": 6375, - "loss": 7.3212, - "learning_rate": 3.98e-05, - "inf_nan_count": 0 - }, - { - "step": 6400, - "loss": 7.2465, - "learning_rate": 4e-05, - "inf_nan_count": 0 - }, - { - "step": 6425, - "loss": 7.2081, - "learning_rate": 4.02e-05, - "inf_nan_count": 0 - }, - { - "step": 6450, - "loss": 7.2852, - "learning_rate": 4.03e-05, - "inf_nan_count": 0 - }, - { - "step": 6475, - "loss": 7.2074, - "learning_rate": 4.05e-05, - "inf_nan_count": 0 - }, - { - "step": 6500, - "loss": 7.252, - "learning_rate": 4.06e-05, - "inf_nan_count": 0 - }, - { - "step": 6525, - "loss": 7.2115, - "learning_rate": 4.08e-05, - "inf_nan_count": 0 - }, - { - "step": 6550, - "loss": 7.2435, - "learning_rate": 4.09e-05, - "inf_nan_count": 0 - }, - { - "step": 6575, - "loss": 7.1962, - "learning_rate": 4.11e-05, - "inf_nan_count": 0 - }, - { - "step": 6600, - "loss": 7.1631, - "learning_rate": 4.12e-05, - "inf_nan_count": 0 - }, - { - "step": 6625, - "loss": 7.2525, - "learning_rate": 4.14e-05, - "inf_nan_count": 0 - }, - { - "step": 6650, - "loss": 7.2133, - "learning_rate": 4.16e-05, - "inf_nan_count": 0 - }, - { - "step": 6675, - "loss": 7.2248, - "learning_rate": 4.17e-05, - "inf_nan_count": 0 - }, - { - "step": 6700, - "loss": 7.1928, - "learning_rate": 4.19e-05, - "inf_nan_count": 0 - }, - { - "step": 6725, - "loss": 7.1698, - "learning_rate": 4.2e-05, - "inf_nan_count": 0 - }, - { - "step": 6750, - "loss": 7.3037, - "learning_rate": 4.22e-05, - "inf_nan_count": 0 - }, - { - "step": 6775, - "loss": 7.2451, - "learning_rate": 4.23e-05, - "inf_nan_count": 0 - }, - { - "step": 6800, - "loss": 7.1373, - "learning_rate": 4.25e-05, - "inf_nan_count": 0 - }, - { - "step": 6825, - "loss": 7.139, - "learning_rate": 4.27e-05, - "inf_nan_count": 0 - }, - { - "step": 6850, - "loss": 7.1296, - "learning_rate": 4.28e-05, - "inf_nan_count": 0 - }, - { - "step": 6875, - "loss": 7.0961, - "learning_rate": 4.3e-05, - "inf_nan_count": 0 - }, - { - "step": 6900, - "loss": 7.1408, - "learning_rate": 4.31e-05, - "inf_nan_count": 0 - }, - { - "step": 6925, - "loss": 7.1852, - "learning_rate": 4.33e-05, - "inf_nan_count": 0 - }, - { - "step": 6950, - "loss": 7.2067, - "learning_rate": 4.34e-05, - "inf_nan_count": 0 - }, - { - "step": 6975, - "loss": 7.0681, - "learning_rate": 4.36e-05, - "inf_nan_count": 0 - }, - { - "step": 7000, - "loss": 7.1813, - "learning_rate": 4.37e-05, - "inf_nan_count": 0 - }, - { - "step": 7025, - "loss": 7.1992, - "learning_rate": 4.39e-05, - "inf_nan_count": 0 - }, - { - "step": 7050, - "loss": 7.1409, - "learning_rate": 4.41e-05, - "inf_nan_count": 0 - }, - { - "step": 7075, - "loss": 7.1271, - "learning_rate": 4.42e-05, - "inf_nan_count": 0 - }, - { - "step": 7100, - "loss": 7.172, - "learning_rate": 4.44e-05, - "inf_nan_count": 0 - }, - { - "step": 7125, - "loss": 7.1515, - "learning_rate": 4.45e-05, - "inf_nan_count": 0 - }, - { - "step": 7150, - "loss": 7.0898, - "learning_rate": 4.47e-05, - "inf_nan_count": 0 - }, - { - "step": 7175, - "loss": 7.0996, - "learning_rate": 4.48e-05, - "inf_nan_count": 0 - }, - { - "step": 7200, - "loss": 7.061, - "learning_rate": 4.5e-05, - "inf_nan_count": 0 - }, - { - "step": 7225, - "loss": 7.1939, - "learning_rate": 4.52e-05, - "inf_nan_count": 0 - }, - { - "step": 7250, - "loss": 7.0355, - "learning_rate": 4.53e-05, - "inf_nan_count": 0 - }, - { - "step": 7275, - "loss": 7.0935, - "learning_rate": 4.55e-05, - "inf_nan_count": 0 - }, - { - "step": 7300, - "loss": 7.0689, - "learning_rate": 4.56e-05, - "inf_nan_count": 0 - }, - { - "step": 7325, - "loss": 7.0265, - "learning_rate": 4.58e-05, - "inf_nan_count": 0 - }, - { - "step": 7350, - "loss": 7.0963, - "learning_rate": 4.59e-05, - "inf_nan_count": 0 - }, - { - "step": 7375, - "loss": 7.1138, - "learning_rate": 4.61e-05, - "inf_nan_count": 0 - }, - { - "step": 7400, - "loss": 7.0414, - "learning_rate": 4.63e-05, - "inf_nan_count": 0 - }, - { - "step": 7425, - "loss": 7.0753, - "learning_rate": 4.64e-05, - "inf_nan_count": 0 - }, - { - "step": 7450, - "loss": 7.0603, - "learning_rate": 4.66e-05, - "inf_nan_count": 0 - }, - { - "step": 7475, - "loss": 7.0818, - "learning_rate": 4.67e-05, - "inf_nan_count": 0 - }, - { - "step": 7500, - "loss": 7.0788, - "learning_rate": 4.69e-05, - "inf_nan_count": 0 - }, - { - "step": 7525, - "loss": 6.9952, - "learning_rate": 4.7e-05, - "inf_nan_count": 0 - }, - { - "step": 7550, - "loss": 7.0114, - "learning_rate": 4.72e-05, - "inf_nan_count": 0 - }, - { - "step": 7575, - "loss": 7.0611, - "learning_rate": 4.73e-05, - "inf_nan_count": 0 - }, - { - "step": 7600, - "loss": 7.0057, - "learning_rate": 4.75e-05, - "inf_nan_count": 0 - }, - { - "step": 7625, - "loss": 7.0182, - "learning_rate": 4.77e-05, - "inf_nan_count": 0 - }, - { - "step": 7650, - "loss": 7.0271, - "learning_rate": 4.78e-05, - "inf_nan_count": 0 - }, - { - "step": 7675, - "loss": 7.0817, - "learning_rate": 4.8e-05, - "inf_nan_count": 0 - }, - { - "step": 7700, - "loss": 7.0859, - "learning_rate": 4.81e-05, - "inf_nan_count": 0 - }, - { - "step": 7725, - "loss": 6.9859, - "learning_rate": 4.83e-05, - "inf_nan_count": 0 - }, - { - "step": 7750, - "loss": 7.038, - "learning_rate": 4.84e-05, - "inf_nan_count": 0 - }, - { - "step": 7775, - "loss": 6.9784, - "learning_rate": 4.86e-05, - "inf_nan_count": 0 - }, - { - "step": 7800, - "loss": 7.0304, - "learning_rate": 4.87e-05, - "inf_nan_count": 0 - }, - { - "step": 7825, - "loss": 7.0, - "learning_rate": 4.89e-05, - "inf_nan_count": 0 - }, - { - "step": 7850, - "loss": 7.0159, - "learning_rate": 4.91e-05, - "inf_nan_count": 0 - }, - { - "step": 7875, - "loss": 6.9859, - "learning_rate": 4.92e-05, - "inf_nan_count": 0 - }, - { - "step": 7900, - "loss": 6.9348, - "learning_rate": 4.94e-05, - "inf_nan_count": 0 - }, - { - "step": 7925, - "loss": 6.9541, - "learning_rate": 4.95e-05, - "inf_nan_count": 0 - }, - { - "step": 7950, - "loss": 6.9342, - "learning_rate": 4.97e-05, - "inf_nan_count": 0 - }, - { - "step": 7975, - "loss": 7.0294, - "learning_rate": 4.98e-05, - "inf_nan_count": 0 - }, - { - "step": 8000, - "loss": 7.0412, - "learning_rate": 5e-05, - "inf_nan_count": 0 - }, - { - "step": 8025, - "loss": 6.9111, - "learning_rate": 4.99e-05, - "inf_nan_count": 0 - }, - { - "step": 8050, - "loss": 7.0142, - "learning_rate": 4.98e-05, - "inf_nan_count": 0 - }, - { - "step": 8075, - "loss": 6.9201, - "learning_rate": 4.97e-05, - "inf_nan_count": 0 - }, - { - "step": 8100, - "loss": 6.91, - "learning_rate": 4.96e-05, - "inf_nan_count": 0 - }, - { - "step": 8125, - "loss": 6.9728, - "learning_rate": 4.95e-05, - "inf_nan_count": 0 - }, - { - "step": 8150, - "loss": 6.9963, - "learning_rate": 4.94e-05, - "inf_nan_count": 0 - }, - { - "step": 8175, - "loss": 7.0077, - "learning_rate": 4.93e-05, - "inf_nan_count": 0 - }, - { - "step": 8200, - "loss": 6.8808, - "learning_rate": 4.92e-05, - "inf_nan_count": 0 - }, - { - "step": 8225, - "loss": 6.85, - "learning_rate": 4.91e-05, - "inf_nan_count": 0 - }, - { - "step": 8250, - "loss": 6.9328, - "learning_rate": 4.9e-05, - "inf_nan_count": 0 - }, - { - "step": 8275, - "loss": 6.8971, - "learning_rate": 4.89e-05, - "inf_nan_count": 0 - }, - { - "step": 8300, - "loss": 6.9635, - "learning_rate": 4.87e-05, - "inf_nan_count": 0 - }, - { - "step": 8325, - "loss": 6.8937, - "learning_rate": 4.86e-05, - "inf_nan_count": 0 - }, - { - "step": 8350, - "loss": 6.8578, - "learning_rate": 4.85e-05, - "inf_nan_count": 0 - }, - { - "step": 8375, - "loss": 6.9492, - "learning_rate": 4.84e-05, - "inf_nan_count": 0 - }, - { - "step": 8400, - "loss": 6.8896, - "learning_rate": 4.83e-05, - "inf_nan_count": 0 - }, - { - "step": 8425, - "loss": 6.9677, - "learning_rate": 4.82e-05, - "inf_nan_count": 0 - }, - { - "step": 8450, - "loss": 6.9071, - "learning_rate": 4.81e-05, - "inf_nan_count": 0 - }, - { - "step": 8475, - "loss": 6.8973, - "learning_rate": 4.8e-05, - "inf_nan_count": 0 - }, - { - "step": 8500, - "loss": 6.9139, - "learning_rate": 4.79e-05, - "inf_nan_count": 0 - }, - { - "step": 8525, - "loss": 6.8983, - "learning_rate": 4.78e-05, - "inf_nan_count": 0 - }, - { - "step": 8550, - "loss": 6.8446, - "learning_rate": 4.77e-05, - "inf_nan_count": 0 - }, - { - "step": 8575, - "loss": 6.8246, - "learning_rate": 4.76e-05, - "inf_nan_count": 0 - }, - { - "step": 8600, - "loss": 6.9637, - "learning_rate": 4.75e-05, - "inf_nan_count": 0 - }, - { - "step": 8625, - "loss": 6.8827, - "learning_rate": 4.74e-05, - "inf_nan_count": 0 - }, - { - "step": 8650, - "loss": 6.8234, - "learning_rate": 4.73e-05, - "inf_nan_count": 0 - }, - { - "step": 8675, - "loss": 6.827, - "learning_rate": 4.72e-05, - "inf_nan_count": 0 - }, - { - "step": 8700, - "loss": 6.9554, - "learning_rate": 4.71e-05, - "inf_nan_count": 0 - }, - { - "step": 8725, - "loss": 6.8406, - "learning_rate": 4.7e-05, - "inf_nan_count": 0 - }, - { - "step": 8750, - "loss": 6.8328, - "learning_rate": 4.69e-05, - "inf_nan_count": 0 - }, - { - "step": 8775, - "loss": 6.8362, - "learning_rate": 4.68e-05, - "inf_nan_count": 0 - }, - { - "step": 8800, - "loss": 6.8417, - "learning_rate": 4.67e-05, - "inf_nan_count": 0 - }, - { - "step": 8825, - "loss": 6.8248, - "learning_rate": 4.66e-05, - "inf_nan_count": 0 - }, - { - "step": 8850, - "loss": 6.7996, - "learning_rate": 4.65e-05, - "inf_nan_count": 0 - }, - { - "step": 8875, - "loss": 6.7804, - "learning_rate": 4.64e-05, - "inf_nan_count": 0 - }, - { - "step": 8900, - "loss": 6.8802, - "learning_rate": 4.63e-05, - "inf_nan_count": 0 - }, - { - "step": 8925, - "loss": 6.8586, - "learning_rate": 4.61e-05, - "inf_nan_count": 0 - }, - { - "step": 8950, - "loss": 6.8489, - "learning_rate": 4.6e-05, - "inf_nan_count": 0 - }, - { - "step": 8975, - "loss": 6.8592, - "learning_rate": 4.59e-05, - "inf_nan_count": 0 - }, - { - "step": 9000, - "loss": 6.8302, - "learning_rate": 4.58e-05, - "inf_nan_count": 0 - }, - { - "step": 9025, - "loss": 6.831, - "learning_rate": 4.57e-05, - "inf_nan_count": 0 - }, - { - "step": 9050, - "loss": 6.7991, - "learning_rate": 4.56e-05, - "inf_nan_count": 0 - }, - { - "step": 9075, - "loss": 6.8311, - "learning_rate": 4.55e-05, - "inf_nan_count": 0 - }, - { - "step": 9100, - "loss": 6.7647, - "learning_rate": 4.54e-05, - "inf_nan_count": 0 - }, - { - "step": 9125, - "loss": 6.8225, - "learning_rate": 4.53e-05, - "inf_nan_count": 0 - }, - { - "step": 9150, - "loss": 6.7571, - "learning_rate": 4.52e-05, - "inf_nan_count": 0 - }, - { - "step": 9175, - "loss": 6.806, - "learning_rate": 4.51e-05, - "inf_nan_count": 0 - }, - { - "step": 9200, - "loss": 6.8348, - "learning_rate": 4.5e-05, - "inf_nan_count": 0 - }, - { - "step": 9225, - "loss": 6.9131, - "learning_rate": 4.49e-05, - "inf_nan_count": 0 - }, - { - "step": 9250, - "loss": 6.7801, - "learning_rate": 4.48e-05, - "inf_nan_count": 0 - }, - { - "step": 9275, - "loss": 6.7776, - "learning_rate": 4.47e-05, - "inf_nan_count": 0 - }, - { - "step": 9300, - "loss": 6.716, - "learning_rate": 4.46e-05, - "inf_nan_count": 0 - }, - { - "step": 9325, - "loss": 6.8958, - "learning_rate": 4.45e-05, - "inf_nan_count": 0 - }, - { - "step": 9350, - "loss": 6.8734, - "learning_rate": 4.44e-05, - "inf_nan_count": 0 - }, - { - "step": 9375, - "loss": 6.7203, - "learning_rate": 4.43e-05, - "inf_nan_count": 0 - }, - { - "step": 9400, - "loss": 6.7133, - "learning_rate": 4.42e-05, - "inf_nan_count": 0 - }, - { - "step": 9425, - "loss": 6.8392, - "learning_rate": 4.41e-05, - "inf_nan_count": 0 - }, - { - "step": 9450, - "loss": 6.7945, - "learning_rate": 4.4e-05, - "inf_nan_count": 0 - }, - { - "step": 9475, - "loss": 6.7831, - "learning_rate": 4.39e-05, - "inf_nan_count": 0 - }, - { - "step": 9500, - "loss": 6.7336, - "learning_rate": 4.37e-05, - "inf_nan_count": 0 - }, - { - "step": 9525, - "loss": 6.7529, - "learning_rate": 4.36e-05, - "inf_nan_count": 0 - }, - { - "step": 9550, - "loss": 6.6838, - "learning_rate": 4.35e-05, - "inf_nan_count": 0 - }, - { - "step": 9575, - "loss": 6.7548, - "learning_rate": 4.34e-05, - "inf_nan_count": 0 - }, - { - "step": 9600, - "loss": 6.8837, - "learning_rate": 4.33e-05, - "inf_nan_count": 0 - }, - { - "step": 9625, - "loss": 6.8271, - "learning_rate": 4.32e-05, - "inf_nan_count": 0 - }, - { - "step": 9650, - "loss": 6.7446, - "learning_rate": 4.31e-05, - "inf_nan_count": 0 - }, - { - "step": 9675, - "loss": 6.6811, - "learning_rate": 4.3e-05, - "inf_nan_count": 0 - }, - { - "step": 9700, - "loss": 6.7641, - "learning_rate": 4.29e-05, - "inf_nan_count": 0 - }, - { - "step": 9725, - "loss": 6.6779, - "learning_rate": 4.28e-05, - "inf_nan_count": 0 - }, - { - "step": 9750, - "loss": 6.7428, - "learning_rate": 4.27e-05, - "inf_nan_count": 0 - }, - { - "step": 9775, - "loss": 6.7698, - "learning_rate": 4.26e-05, - "inf_nan_count": 0 - }, - { - "step": 9800, - "loss": 6.7282, - "learning_rate": 4.25e-05, - "inf_nan_count": 0 - }, - { - "step": 9825, - "loss": 6.7314, - "learning_rate": 4.24e-05, - "inf_nan_count": 0 - }, - { - "step": 9850, - "loss": 6.7281, - "learning_rate": 4.23e-05, - "inf_nan_count": 0 - }, - { - "step": 9875, - "loss": 6.8553, - "learning_rate": 4.22e-05, - "inf_nan_count": 0 - }, - { - "step": 9900, - "loss": 6.7912, - "learning_rate": 4.21e-05, - "inf_nan_count": 0 - }, - { - "step": 9925, - "loss": 6.7301, - "learning_rate": 4.2e-05, - "inf_nan_count": 0 - }, - { - "step": 9950, - "loss": 6.7467, - "learning_rate": 4.19e-05, - "inf_nan_count": 0 - }, - { - "step": 9975, - "loss": 6.6581, - "learning_rate": 4.18e-05, - "inf_nan_count": 0 - }, - { - "step": 10000, - "loss": 6.7114, - "learning_rate": 4.17e-05, - "inf_nan_count": 0 - }, - { - "step": 10025, - "loss": 6.7754, - "learning_rate": 4.16e-05, - "inf_nan_count": 0 - }, - { - "step": 10050, - "loss": 6.695, - "learning_rate": 4.15e-05, - "inf_nan_count": 0 - }, - { - "step": 10075, - "loss": 6.6791, - "learning_rate": 4.14e-05, - "inf_nan_count": 0 - }, - { - "step": 10100, - "loss": 6.6957, - "learning_rate": 4.12e-05, - "inf_nan_count": 0 - }, - { - "step": 10125, - "loss": 6.7073, - "learning_rate": 4.11e-05, - "inf_nan_count": 0 - }, - { - "step": 10150, - "loss": 6.774, - "learning_rate": 4.1e-05, - "inf_nan_count": 0 - }, - { - "step": 10175, - "loss": 6.8045, - "learning_rate": 4.09e-05, - "inf_nan_count": 0 - }, - { - "step": 10200, - "loss": 6.761, - "learning_rate": 4.08e-05, - "inf_nan_count": 0 - }, - { - "step": 10225, - "loss": 6.6995, - "learning_rate": 4.07e-05, - "inf_nan_count": 0 - }, - { - "step": 10250, - "loss": 6.6779, - "learning_rate": 4.06e-05, - "inf_nan_count": 0 - }, - { - "step": 10275, - "loss": 6.7462, - "learning_rate": 4.05e-05, - "inf_nan_count": 0 - }, - { - "step": 10300, - "loss": 6.7099, - "learning_rate": 4.04e-05, - "inf_nan_count": 0 - }, - { - "step": 10325, - "loss": 6.7013, - "learning_rate": 4.03e-05, - "inf_nan_count": 0 - }, - { - "step": 10350, - "loss": 6.7173, - "learning_rate": 4.02e-05, - "inf_nan_count": 0 - }, - { - "step": 10375, - "loss": 6.6967, - "learning_rate": 4.01e-05, - "inf_nan_count": 0 - }, - { - "step": 10400, - "loss": 6.7565, - "learning_rate": 4e-05, - "inf_nan_count": 0 - }, - { - "step": 10425, - "loss": 6.7468, - "learning_rate": 3.99e-05, - "inf_nan_count": 0 - }, - { - "step": 10450, - "loss": 6.7132, - "learning_rate": 3.98e-05, - "inf_nan_count": 0 - }, - { - "step": 10475, - "loss": 6.6358, - "learning_rate": 3.97e-05, - "inf_nan_count": 0 - }, - { - "step": 10500, - "loss": 6.6979, - "learning_rate": 3.96e-05, - "inf_nan_count": 0 - }, - { - "step": 10525, - "loss": 6.6512, - "learning_rate": 3.95e-05, - "inf_nan_count": 0 - }, - { - "step": 10550, - "loss": 6.6045, - "learning_rate": 3.94e-05, - "inf_nan_count": 0 - }, - { - "step": 10575, - "loss": 6.6217, - "learning_rate": 3.93e-05, - "inf_nan_count": 0 - }, - { - "step": 10600, - "loss": 6.7091, - "learning_rate": 3.92e-05, - "inf_nan_count": 0 - }, - { - "step": 10625, - "loss": 6.618, - "learning_rate": 3.91e-05, - "inf_nan_count": 0 - }, - { - "step": 10650, - "loss": 6.6743, - "learning_rate": 3.9e-05, - "inf_nan_count": 0 - }, - { - "step": 10675, - "loss": 6.6481, - "learning_rate": 3.89e-05, - "inf_nan_count": 0 - }, - { - "step": 10700, - "loss": 6.6888, - "learning_rate": 3.87e-05, - "inf_nan_count": 0 - }, - { - "step": 10725, - "loss": 6.5786, - "learning_rate": 3.86e-05, - "inf_nan_count": 0 - }, - { - "step": 10750, - "loss": 6.6917, - "learning_rate": 3.85e-05, - "inf_nan_count": 0 - }, - { - "step": 10775, - "loss": 6.6487, - "learning_rate": 3.84e-05, - "inf_nan_count": 0 - }, - { - "step": 10800, - "loss": 6.7293, - "learning_rate": 3.83e-05, - "inf_nan_count": 0 - }, - { - "step": 10825, - "loss": 6.6369, - "learning_rate": 3.82e-05, - "inf_nan_count": 0 - }, - { - "step": 10850, - "loss": 6.7118, - "learning_rate": 3.81e-05, - "inf_nan_count": 0 - }, - { - "step": 10875, - "loss": 6.7235, - "learning_rate": 3.8e-05, - "inf_nan_count": 0 - }, - { - "step": 10900, - "loss": 6.6963, - "learning_rate": 3.79e-05, - "inf_nan_count": 0 - }, - { - "step": 10925, - "loss": 6.6791, - "learning_rate": 3.78e-05, - "inf_nan_count": 0 - }, - { - "step": 10950, - "loss": 6.6773, - "learning_rate": 3.77e-05, - "inf_nan_count": 0 - }, - { - "step": 10975, - "loss": 6.6819, - "learning_rate": 3.76e-05, - "inf_nan_count": 0 - }, - { - "step": 11000, - "loss": 6.6167, - "learning_rate": 3.75e-05, - "inf_nan_count": 0 - }, - { - "step": 11025, - "loss": 6.6727, - "learning_rate": 3.74e-05, - "inf_nan_count": 0 - }, - { - "step": 11050, - "loss": 6.6317, - "learning_rate": 3.73e-05, - "inf_nan_count": 0 - }, - { - "step": 11075, - "loss": 6.6432, - "learning_rate": 3.72e-05, - "inf_nan_count": 0 - }, - { - "step": 11100, - "loss": 6.6468, - "learning_rate": 3.71e-05, - "inf_nan_count": 0 - }, - { - "step": 11125, - "loss": 6.646, - "learning_rate": 3.7e-05, - "inf_nan_count": 0 - }, - { - "step": 11150, - "loss": 6.6852, - "learning_rate": 3.69e-05, - "inf_nan_count": 0 - }, - { - "step": 11175, - "loss": 6.5716, - "learning_rate": 3.68e-05, - "inf_nan_count": 0 - }, - { - "step": 11200, - "loss": 6.6311, - "learning_rate": 3.67e-05, - "inf_nan_count": 0 - }, - { - "step": 11225, - "loss": 6.648, - "learning_rate": 3.66e-05, - "inf_nan_count": 0 - }, - { - "step": 11250, - "loss": 6.6204, - "learning_rate": 3.65e-05, - "inf_nan_count": 0 - }, - { - "step": 11275, - "loss": 6.6551, - "learning_rate": 3.64e-05, - "inf_nan_count": 0 - }, - { - "step": 11300, - "loss": 6.6013, - "learning_rate": 3.63e-05, - "inf_nan_count": 0 - }, - { - "step": 11325, - "loss": 6.6478, - "learning_rate": 3.61e-05, - "inf_nan_count": 0 - }, - { - "step": 11350, - "loss": 6.6938, - "learning_rate": 3.6e-05, - "inf_nan_count": 0 - }, - { - "step": 11375, - "loss": 6.6124, - "learning_rate": 3.59e-05, - "inf_nan_count": 0 - }, - { - "step": 11400, - "loss": 6.6781, - "learning_rate": 3.58e-05, - "inf_nan_count": 0 - }, - { - "step": 11425, - "loss": 6.6317, - "learning_rate": 3.57e-05, - "inf_nan_count": 0 - }, - { - "step": 11450, - "loss": 6.6195, - "learning_rate": 3.56e-05, - "inf_nan_count": 0 - }, - { - "step": 11475, - "loss": 6.5941, - "learning_rate": 3.55e-05, - "inf_nan_count": 0 - }, - { - "step": 11500, - "loss": 6.5808, - "learning_rate": 3.54e-05, - "inf_nan_count": 0 - }, - { - "step": 11525, - "loss": 6.6322, - "learning_rate": 3.53e-05, - "inf_nan_count": 0 - }, - { - "step": 11550, - "loss": 6.6172, - "learning_rate": 3.52e-05, - "inf_nan_count": 0 - }, - { - "step": 11575, - "loss": 6.649, - "learning_rate": 3.51e-05, - "inf_nan_count": 0 - }, - { - "step": 11600, - "loss": 6.605, - "learning_rate": 3.5e-05, - "inf_nan_count": 0 - }, - { - "step": 11625, - "loss": 6.6184, - "learning_rate": 3.49e-05, - "inf_nan_count": 0 - }, - { - "step": 11650, - "loss": 6.5597, - "learning_rate": 3.48e-05, - "inf_nan_count": 0 - }, - { - "step": 11675, - "loss": 6.6285, - "learning_rate": 3.47e-05, - "inf_nan_count": 0 - }, - { - "step": 11700, - "loss": 6.5209, - "learning_rate": 3.46e-05, - "inf_nan_count": 0 - }, - { - "step": 11725, - "loss": 6.5505, - "learning_rate": 3.45e-05, - "inf_nan_count": 0 - }, - { - "step": 11750, - "loss": 6.671, - "learning_rate": 3.44e-05, - "inf_nan_count": 0 - }, - { - "step": 11775, - "loss": 6.6403, - "learning_rate": 3.43e-05, - "inf_nan_count": 0 - }, - { - "step": 11800, - "loss": 6.5738, - "learning_rate": 3.42e-05, - "inf_nan_count": 0 - }, - { - "step": 11825, - "loss": 6.608, - "learning_rate": 3.41e-05, - "inf_nan_count": 0 - }, - { - "step": 11850, - "loss": 6.6406, - "learning_rate": 3.4e-05, - "inf_nan_count": 0 - }, - { - "step": 11875, - "loss": 6.6299, - "learning_rate": 3.39e-05, - "inf_nan_count": 0 - }, - { - "step": 11900, - "loss": 6.5781, - "learning_rate": 3.38e-05, - "inf_nan_count": 0 - }, - { - "step": 11925, - "loss": 6.5003, - "learning_rate": 3.36e-05, - "inf_nan_count": 0 - }, - { - "step": 11950, - "loss": 6.635, - "learning_rate": 3.35e-05, - "inf_nan_count": 0 - }, - { - "step": 11975, - "loss": 6.618, - "learning_rate": 3.34e-05, - "inf_nan_count": 0 - }, - { - "step": 12000, - "loss": 6.6603, - "learning_rate": 3.33e-05, - "inf_nan_count": 0 - }, - { - "step": 12025, - "loss": 6.5507, - "learning_rate": 3.32e-05, - "inf_nan_count": 0 - }, - { - "step": 12050, - "loss": 6.5878, - "learning_rate": 3.31e-05, - "inf_nan_count": 0 - }, - { - "step": 12075, - "loss": 6.5245, - "learning_rate": 3.3e-05, - "inf_nan_count": 0 - }, - { - "step": 12100, - "loss": 6.5629, - "learning_rate": 3.29e-05, - "inf_nan_count": 0 - }, - { - "step": 12125, - "loss": 6.6181, - "learning_rate": 3.28e-05, - "inf_nan_count": 0 - }, - { - "step": 12150, - "loss": 6.578, - "learning_rate": 3.27e-05, - "inf_nan_count": 0 - }, - { - "step": 12175, - "loss": 6.5753, - "learning_rate": 3.26e-05, - "inf_nan_count": 0 - }, - { - "step": 12200, - "loss": 6.6071, - "learning_rate": 3.25e-05, - "inf_nan_count": 0 - }, - { - "step": 12225, - "loss": 6.5885, - "learning_rate": 3.24e-05, - "inf_nan_count": 0 - }, - { - "step": 12250, - "loss": 6.5413, - "learning_rate": 3.23e-05, - "inf_nan_count": 0 - }, - { - "step": 12275, - "loss": 6.6635, - "learning_rate": 3.22e-05, - "inf_nan_count": 0 - }, - { - "step": 12300, - "loss": 6.6304, - "learning_rate": 3.21e-05, - "inf_nan_count": 0 - }, - { - "step": 12325, - "loss": 6.5078, - "learning_rate": 3.2e-05, - "inf_nan_count": 0 - }, - { - "step": 12350, - "loss": 6.5712, - "learning_rate": 3.19e-05, - "inf_nan_count": 0 - }, - { - "step": 12375, - "loss": 6.6284, - "learning_rate": 3.18e-05, - "inf_nan_count": 0 - }, - { - "step": 12400, - "loss": 6.5837, - "learning_rate": 3.17e-05, - "inf_nan_count": 0 - }, - { - "step": 12425, - "loss": 6.5354, - "learning_rate": 3.16e-05, - "inf_nan_count": 0 - }, - { - "step": 12450, - "loss": 6.6125, - "learning_rate": 3.15e-05, - "inf_nan_count": 0 - }, - { - "step": 12475, - "loss": 6.5477, - "learning_rate": 3.14e-05, - "inf_nan_count": 0 - }, - { - "step": 12500, - "loss": 6.5827, - "learning_rate": 3.13e-05, - "inf_nan_count": 0 - }, - { - "step": 12525, - "loss": 6.5874, - "learning_rate": 3.11e-05, - "inf_nan_count": 0 - }, - { - "step": 12550, - "loss": 6.5437, - "learning_rate": 3.1e-05, - "inf_nan_count": 0 - }, - { - "step": 12575, - "loss": 6.582, - "learning_rate": 3.09e-05, - "inf_nan_count": 0 - }, - { - "step": 12600, - "loss": 6.5286, - "learning_rate": 3.08e-05, - "inf_nan_count": 0 - }, - { - "step": 12625, - "loss": 6.5144, - "learning_rate": 3.07e-05, - "inf_nan_count": 0 - }, - { - "step": 12650, - "loss": 6.5327, - "learning_rate": 3.06e-05, - "inf_nan_count": 0 - }, - { - "step": 12675, - "loss": 6.6058, - "learning_rate": 3.05e-05, - "inf_nan_count": 0 - }, - { - "step": 12700, - "loss": 6.5626, - "learning_rate": 3.04e-05, - "inf_nan_count": 0 - }, - { - "step": 12725, - "loss": 6.4589, - "learning_rate": 3.03e-05, - "inf_nan_count": 0 - }, - { - "step": 12750, - "loss": 6.5629, - "learning_rate": 3.02e-05, - "inf_nan_count": 0 - }, - { - "step": 12775, - "loss": 6.4815, - "learning_rate": 3.01e-05, - "inf_nan_count": 0 - }, - { - "step": 12800, - "loss": 6.5651, - "learning_rate": 3e-05, - "inf_nan_count": 0 - }, - { - "step": 12825, - "loss": 6.6164, - "learning_rate": 2.99e-05, - "inf_nan_count": 0 - }, - { - "step": 12850, - "loss": 6.6102, - "learning_rate": 2.98e-05, - "inf_nan_count": 0 - }, - { - "step": 12875, - "loss": 6.4871, - "learning_rate": 2.97e-05, - "inf_nan_count": 0 - }, - { - "step": 12900, - "loss": 6.49, - "learning_rate": 2.96e-05, - "inf_nan_count": 0 - }, - { - "step": 12925, - "loss": 6.6028, - "learning_rate": 2.95e-05, - "inf_nan_count": 0 - }, - { - "step": 12950, - "loss": 6.5509, - "learning_rate": 2.94e-05, - "inf_nan_count": 0 - }, - { - "step": 12975, - "loss": 6.5454, - "learning_rate": 2.93e-05, - "inf_nan_count": 0 - }, - { - "step": 13000, - "loss": 6.5587, - "learning_rate": 2.92e-05, - "inf_nan_count": 0 - }, - { - "step": 13025, - "loss": 6.5862, - "learning_rate": 2.91e-05, - "inf_nan_count": 0 - }, - { - "step": 13050, - "loss": 6.5668, - "learning_rate": 2.9e-05, - "inf_nan_count": 0 - }, - { - "step": 13075, - "loss": 6.522, - "learning_rate": 2.89e-05, - "inf_nan_count": 0 - }, - { - "step": 13100, - "loss": 6.5044, - "learning_rate": 2.87e-05, - "inf_nan_count": 0 - }, - { - "step": 13125, - "loss": 6.6356, - "learning_rate": 2.86e-05, - "inf_nan_count": 0 - }, - { - "step": 13150, - "loss": 6.4772, - "learning_rate": 2.85e-05, - "inf_nan_count": 0 - }, - { - "step": 13175, - "loss": 6.5504, - "learning_rate": 2.84e-05, - "inf_nan_count": 0 - }, - { - "step": 13200, - "loss": 6.5415, - "learning_rate": 2.83e-05, - "inf_nan_count": 0 - }, - { - "step": 13225, - "loss": 6.4651, - "learning_rate": 2.82e-05, - "inf_nan_count": 0 - }, - { - "step": 13250, - "loss": 6.5536, - "learning_rate": 2.81e-05, - "inf_nan_count": 0 - }, - { - "step": 13275, - "loss": 6.4861, - "learning_rate": 2.8e-05, - "inf_nan_count": 0 - }, - { - "step": 13300, - "loss": 6.4688, - "learning_rate": 2.79e-05, - "inf_nan_count": 0 - }, - { - "step": 13325, - "loss": 6.5549, - "learning_rate": 2.78e-05, - "inf_nan_count": 0 - }, - { - "step": 13350, - "loss": 6.4589, - "learning_rate": 2.77e-05, - "inf_nan_count": 0 - }, - { - "step": 13375, - "loss": 6.4644, - "learning_rate": 2.76e-05, - "inf_nan_count": 0 - }, - { - "step": 13400, - "loss": 6.5937, - "learning_rate": 2.75e-05, - "inf_nan_count": 0 - }, - { - "step": 13425, - "loss": 6.5798, - "learning_rate": 2.74e-05, - "inf_nan_count": 0 - }, - { - "step": 13450, - "loss": 6.4615, - "learning_rate": 2.73e-05, - "inf_nan_count": 0 - }, - { - "step": 13475, - "loss": 6.5173, - "learning_rate": 2.72e-05, - "inf_nan_count": 0 - }, - { - "step": 13500, - "loss": 6.4795, - "learning_rate": 2.71e-05, - "inf_nan_count": 0 - }, - { - "step": 13525, - "loss": 6.4789, - "learning_rate": 2.7e-05, - "inf_nan_count": 0 - }, - { - "step": 13550, - "loss": 6.4835, - "learning_rate": 2.69e-05, - "inf_nan_count": 0 - }, - { - "step": 13575, - "loss": 6.5405, - "learning_rate": 2.68e-05, - "inf_nan_count": 0 - }, - { - "step": 13600, - "loss": 6.4616, - "learning_rate": 2.67e-05, - "inf_nan_count": 0 - }, - { - "step": 13625, - "loss": 6.4578, - "learning_rate": 2.66e-05, - "inf_nan_count": 0 - }, - { - "step": 13650, - "loss": 6.4083, - "learning_rate": 2.65e-05, - "inf_nan_count": 0 - }, - { - "step": 13675, - "loss": 6.561, - "learning_rate": 2.64e-05, - "inf_nan_count": 0 - }, - { - "step": 13700, - "loss": 6.5432, - "learning_rate": 2.63e-05, - "inf_nan_count": 0 - }, - { - "step": 13725, - "loss": 6.5119, - "learning_rate": 2.61e-05, - "inf_nan_count": 0 - }, - { - "step": 13750, - "loss": 6.454, - "learning_rate": 2.6e-05, - "inf_nan_count": 0 - }, - { - "step": 13775, - "loss": 6.44, - "learning_rate": 2.59e-05, - "inf_nan_count": 0 - }, - { - "step": 13800, - "loss": 6.4767, - "learning_rate": 2.58e-05, - "inf_nan_count": 0 - }, - { - "step": 13825, - "loss": 6.4765, - "learning_rate": 2.57e-05, - "inf_nan_count": 0 - }, - { - "step": 13850, - "loss": 6.5018, - "learning_rate": 2.56e-05, - "inf_nan_count": 0 - }, - { - "step": 13875, - "loss": 6.5011, - "learning_rate": 2.55e-05, - "inf_nan_count": 0 - }, - { - "step": 13900, - "loss": 6.4283, - "learning_rate": 2.54e-05, - "inf_nan_count": 0 - }, - { - "step": 13925, - "loss": 6.519, - "learning_rate": 2.53e-05, - "inf_nan_count": 0 - }, - { - "step": 13950, - "loss": 6.4388, - "learning_rate": 2.52e-05, - "inf_nan_count": 0 - }, - { - "step": 13975, - "loss": 6.455, - "learning_rate": 2.51e-05, - "inf_nan_count": 0 - }, - { - "step": 14000, - "loss": 6.3491, - "learning_rate": 2.5e-05, - "inf_nan_count": 0 - }, - { - "step": 14025, - "loss": 6.5285, - "learning_rate": 2.49e-05, - "inf_nan_count": 0 - }, - { - "step": 14050, - "loss": 6.5082, - "learning_rate": 2.48e-05, - "inf_nan_count": 0 - }, - { - "step": 14075, - "loss": 6.5451, - "learning_rate": 2.47e-05, - "inf_nan_count": 0 - }, - { - "step": 14100, - "loss": 6.4753, - "learning_rate": 2.46e-05, - "inf_nan_count": 0 - }, - { - "step": 14125, - "loss": 6.6011, - "learning_rate": 2.45e-05, - "inf_nan_count": 0 - }, - { - "step": 14150, - "loss": 6.4885, - "learning_rate": 2.44e-05, - "inf_nan_count": 0 - }, - { - "step": 14175, - "loss": 6.4635, - "learning_rate": 2.43e-05, - "inf_nan_count": 0 - }, - { - "step": 14200, - "loss": 6.5519, - "learning_rate": 2.42e-05, - "inf_nan_count": 0 - }, - { - "step": 14225, - "loss": 6.4356, - "learning_rate": 2.41e-05, - "inf_nan_count": 0 - }, - { - "step": 14250, - "loss": 6.4552, - "learning_rate": 2.4e-05, - "inf_nan_count": 0 - }, - { - "step": 14275, - "loss": 6.4613, - "learning_rate": 2.39e-05, - "inf_nan_count": 0 - }, - { - "step": 14300, - "loss": 6.4411, - "learning_rate": 2.38e-05, - "inf_nan_count": 0 - }, - { - "step": 14325, - "loss": 6.557, - "learning_rate": 2.36e-05, - "inf_nan_count": 0 - }, - { - "step": 14350, - "loss": 6.4476, - "learning_rate": 2.35e-05, - "inf_nan_count": 0 - }, - { - "step": 14375, - "loss": 6.5895, - "learning_rate": 2.34e-05, - "inf_nan_count": 0 - }, - { - "step": 14400, - "loss": 6.4836, - "learning_rate": 2.33e-05, - "inf_nan_count": 0 - }, - { - "step": 14425, - "loss": 6.4175, - "learning_rate": 2.32e-05, - "inf_nan_count": 0 - }, - { - "step": 14450, - "loss": 6.4971, - "learning_rate": 2.31e-05, - "inf_nan_count": 0 - }, - { - "step": 14475, - "loss": 6.4897, - "learning_rate": 2.3e-05, - "inf_nan_count": 0 - }, - { - "step": 14500, - "loss": 6.455, - "learning_rate": 2.29e-05, - "inf_nan_count": 0 - }, - { - "step": 14525, - "loss": 6.4688, - "learning_rate": 2.28e-05, - "inf_nan_count": 0 - }, - { - "step": 14550, - "loss": 6.5494, - "learning_rate": 2.27e-05, - "inf_nan_count": 0 - }, - { - "step": 14575, - "loss": 6.4501, - "learning_rate": 2.26e-05, - "inf_nan_count": 0 - }, - { - "step": 14600, - "loss": 6.5142, - "learning_rate": 2.25e-05, - "inf_nan_count": 0 - }, - { - "step": 14625, - "loss": 6.4891, - "learning_rate": 2.24e-05, - "inf_nan_count": 0 - }, - { - "step": 14650, - "loss": 6.4274, - "learning_rate": 2.23e-05, - "inf_nan_count": 0 - }, - { - "step": 14675, - "loss": 6.5277, - "learning_rate": 2.22e-05, - "inf_nan_count": 0 - }, - { - "step": 14700, - "loss": 6.4472, - "learning_rate": 2.21e-05, - "inf_nan_count": 0 - }, - { - "step": 14725, - "loss": 6.4328, - "learning_rate": 2.2e-05, - "inf_nan_count": 0 - }, - { - "step": 14750, - "loss": 6.4928, - "learning_rate": 2.19e-05, - "inf_nan_count": 0 - }, - { - "step": 14775, - "loss": 6.552, - "learning_rate": 2.18e-05, - "inf_nan_count": 0 - }, - { - "step": 14800, - "loss": 6.5474, - "learning_rate": 2.17e-05, - "inf_nan_count": 0 - }, - { - "step": 14825, - "loss": 6.4394, - "learning_rate": 2.16e-05, - "inf_nan_count": 0 - }, - { - "step": 14850, - "loss": 6.5234, - "learning_rate": 2.15e-05, - "inf_nan_count": 0 - }, - { - "step": 14875, - "loss": 6.4369, - "learning_rate": 2.14e-05, - "inf_nan_count": 0 - }, - { - "step": 14900, - "loss": 6.4694, - "learning_rate": 2.13e-05, - "inf_nan_count": 0 - }, - { - "step": 14925, - "loss": 6.5837, - "learning_rate": 2.11e-05, - "inf_nan_count": 0 - }, - { - "step": 14950, - "loss": 6.4841, - "learning_rate": 2.1e-05, - "inf_nan_count": 0 - }, - { - "step": 14975, - "loss": 6.4347, - "learning_rate": 2.09e-05, - "inf_nan_count": 0 - }, - { - "step": 15000, - "loss": 6.5816, - "learning_rate": 2.08e-05, - "inf_nan_count": 0 - }, - { - "step": 15025, - "loss": 6.5337, - "learning_rate": 2.07e-05, - "inf_nan_count": 0 - }, - { - "step": 15050, - "loss": 6.5131, - "learning_rate": 2.06e-05, - "inf_nan_count": 0 - }, - { - "step": 15075, - "loss": 6.4669, - "learning_rate": 2.05e-05, - "inf_nan_count": 0 - }, - { - "step": 15100, - "loss": 6.5141, - "learning_rate": 2.04e-05, - "inf_nan_count": 0 - }, - { - "step": 15125, - "loss": 6.438, - "learning_rate": 2.03e-05, - "inf_nan_count": 0 - }, - { - "step": 15150, - "loss": 6.4036, - "learning_rate": 2.02e-05, - "inf_nan_count": 0 - }, - { - "step": 15175, - "loss": 6.4517, - "learning_rate": 2.01e-05, - "inf_nan_count": 0 - }, - { - "step": 15200, - "loss": 6.477, - "learning_rate": 2e-05, - "inf_nan_count": 0 - }, - { - "step": 15225, - "loss": 6.4317, - "learning_rate": 1.99e-05, - "inf_nan_count": 0 - }, - { - "step": 15250, - "loss": 6.488, - "learning_rate": 1.98e-05, - "inf_nan_count": 0 - }, - { - "step": 15275, - "loss": 6.4466, - "learning_rate": 1.97e-05, - "inf_nan_count": 0 - }, - { - "step": 15300, - "loss": 6.4248, - "learning_rate": 1.96e-05, - "inf_nan_count": 0 - }, - { - "step": 15325, - "loss": 6.3834, - "learning_rate": 1.95e-05, - "inf_nan_count": 0 - }, - { - "step": 15350, - "loss": 6.4272, - "learning_rate": 1.94e-05, - "inf_nan_count": 0 - }, - { - "step": 15375, - "loss": 6.4834, - "learning_rate": 1.93e-05, - "inf_nan_count": 0 - }, - { - "step": 15400, - "loss": 6.405, - "learning_rate": 1.92e-05, - "inf_nan_count": 0 - }, - { - "step": 15425, - "loss": 6.4264, - "learning_rate": 1.91e-05, - "inf_nan_count": 0 - }, - { - "step": 15450, - "loss": 6.4941, - "learning_rate": 1.9e-05, - "inf_nan_count": 0 - }, - { - "step": 15475, - "loss": 6.4755, - "learning_rate": 1.89e-05, - "inf_nan_count": 0 - }, - { - "step": 15500, - "loss": 6.5459, - "learning_rate": 1.88e-05, - "inf_nan_count": 0 - }, - { - "step": 15525, - "loss": 6.3772, - "learning_rate": 1.86e-05, - "inf_nan_count": 0 - }, - { - "step": 15550, - "loss": 6.443, - "learning_rate": 1.85e-05, - "inf_nan_count": 0 - }, - { - "step": 15575, - "loss": 6.3931, - "learning_rate": 1.84e-05, - "inf_nan_count": 0 - }, - { - "step": 15600, - "loss": 6.4087, - "learning_rate": 1.83e-05, - "inf_nan_count": 0 - }, - { - "step": 15625, - "loss": 6.4743, - "learning_rate": 1.82e-05, - "inf_nan_count": 0 - }, - { - "step": 15650, - "loss": 6.4575, - "learning_rate": 1.81e-05, - "inf_nan_count": 0 - }, - { - "step": 15675, - "loss": 6.4971, - "learning_rate": 1.8e-05, - "inf_nan_count": 0 - }, - { - "step": 15700, - "loss": 6.438, - "learning_rate": 1.79e-05, - "inf_nan_count": 0 - }, - { - "step": 15725, - "loss": 6.5071, - "learning_rate": 1.78e-05, - "inf_nan_count": 0 - }, - { - "step": 15750, - "loss": 6.391, - "learning_rate": 1.77e-05, - "inf_nan_count": 0 - }, - { - "step": 15775, - "loss": 6.4386, - "learning_rate": 1.76e-05, - "inf_nan_count": 0 - }, - { - "step": 15800, - "loss": 6.4268, - "learning_rate": 1.75e-05, - "inf_nan_count": 0 - }, - { - "step": 15825, - "loss": 6.5534, - "learning_rate": 1.74e-05, - "inf_nan_count": 0 - }, - { - "step": 15850, - "loss": 6.4422, - "learning_rate": 1.73e-05, - "inf_nan_count": 0 - }, - { - "step": 15875, - "loss": 6.4075, - "learning_rate": 1.72e-05, - "inf_nan_count": 0 - }, - { - "step": 15900, - "loss": 6.4458, - "learning_rate": 1.71e-05, - "inf_nan_count": 0 - }, - { - "step": 15925, - "loss": 6.3855, - "learning_rate": 1.7e-05, - "inf_nan_count": 0 - }, - { - "step": 15950, - "loss": 6.3659, - "learning_rate": 1.69e-05, - "inf_nan_count": 0 - }, - { - "step": 15975, - "loss": 6.5396, - "learning_rate": 1.68e-05, - "inf_nan_count": 0 - }, - { - "step": 16000, - "loss": 6.4974, - "learning_rate": 1.67e-05, - "inf_nan_count": 0 - }, - { - "step": 16025, - "loss": 6.4785, - "learning_rate": 1.66e-05, - "inf_nan_count": 0 - }, - { - "step": 16050, - "loss": 6.4341, - "learning_rate": 1.65e-05, - "inf_nan_count": 0 - }, - { - "step": 16075, - "loss": 6.3709, - "learning_rate": 1.64e-05, - "inf_nan_count": 0 - }, - { - "step": 16100, - "loss": 6.3707, - "learning_rate": 1.63e-05, - "inf_nan_count": 0 - }, - { - "step": 16125, - "loss": 6.4206, - "learning_rate": 1.61e-05, - "inf_nan_count": 0 - }, - { - "step": 16150, - "loss": 6.397, - "learning_rate": 1.6e-05, - "inf_nan_count": 0 - }, - { - "step": 16175, - "loss": 6.4617, - "learning_rate": 1.59e-05, - "inf_nan_count": 0 - }, - { - "step": 16200, - "loss": 6.5586, - "learning_rate": 1.58e-05, - "inf_nan_count": 0 - }, - { - "step": 16225, - "loss": 6.4248, - "learning_rate": 1.57e-05, - "inf_nan_count": 0 - }, - { - "step": 16250, - "loss": 6.4204, - "learning_rate": 1.56e-05, - "inf_nan_count": 0 - }, - { - "step": 16275, - "loss": 6.4632, - "learning_rate": 1.55e-05, - "inf_nan_count": 0 - }, - { - "step": 16300, - "loss": 6.4491, - "learning_rate": 1.54e-05, - "inf_nan_count": 0 - }, - { - "step": 16325, - "loss": 6.4412, - "learning_rate": 1.53e-05, - "inf_nan_count": 0 - }, - { - "step": 16350, - "loss": 6.4144, - "learning_rate": 1.52e-05, - "inf_nan_count": 0 - }, - { - "step": 16375, - "loss": 6.466, - "learning_rate": 1.51e-05, - "inf_nan_count": 0 - }, - { - "step": 16400, - "loss": 6.4246, - "learning_rate": 1.5e-05, - "inf_nan_count": 0 - }, - { - "step": 16425, - "loss": 6.4571, - "learning_rate": 1.49e-05, - "inf_nan_count": 0 - }, - { - "step": 16450, - "loss": 6.3903, - "learning_rate": 1.48e-05, - "inf_nan_count": 0 - }, - { - "step": 16475, - "loss": 6.4141, - "learning_rate": 1.47e-05, - "inf_nan_count": 0 - }, - { - "step": 16500, - "loss": 6.4467, - "learning_rate": 1.46e-05, - "inf_nan_count": 0 - }, - { - "step": 16525, - "loss": 6.356, - "learning_rate": 1.45e-05, - "inf_nan_count": 0 - }, - { - "step": 16550, - "loss": 6.4049, - "learning_rate": 1.44e-05, - "inf_nan_count": 0 - }, - { - "step": 16575, - "loss": 6.4103, - "learning_rate": 1.43e-05, - "inf_nan_count": 0 - }, - { - "step": 16600, - "loss": 6.4282, - "learning_rate": 1.42e-05, - "inf_nan_count": 0 - }, - { - "step": 16625, - "loss": 6.5397, - "learning_rate": 1.41e-05, - "inf_nan_count": 0 - }, - { - "step": 16650, - "loss": 6.3862, - "learning_rate": 1.4e-05, - "inf_nan_count": 0 - }, - { - "step": 16675, - "loss": 6.4291, - "learning_rate": 1.39e-05, - "inf_nan_count": 0 - }, - { - "step": 16700, - "loss": 6.433, - "learning_rate": 1.38e-05, - "inf_nan_count": 0 - }, - { - "step": 16725, - "loss": 6.3934, - "learning_rate": 1.36e-05, - "inf_nan_count": 0 - }, - { - "step": 16750, - "loss": 6.4042, - "learning_rate": 1.35e-05, - "inf_nan_count": 0 - }, - { - "step": 16775, - "loss": 6.4187, - "learning_rate": 1.34e-05, - "inf_nan_count": 0 - }, - { - "step": 16800, - "loss": 6.4455, - "learning_rate": 1.33e-05, - "inf_nan_count": 0 - }, - { - "step": 16825, - "loss": 6.424, - "learning_rate": 1.32e-05, - "inf_nan_count": 0 - }, - { - "step": 16850, - "loss": 6.4491, - "learning_rate": 1.31e-05, - "inf_nan_count": 0 - }, - { - "step": 16875, - "loss": 6.3993, - "learning_rate": 1.3e-05, - "inf_nan_count": 0 - }, - { - "step": 16900, - "loss": 6.4393, - "learning_rate": 1.29e-05, - "inf_nan_count": 0 - }, - { - "step": 16925, - "loss": 6.3705, - "learning_rate": 1.28e-05, - "inf_nan_count": 0 - }, - { - "step": 16950, - "loss": 6.4404, - "learning_rate": 1.27e-05, - "inf_nan_count": 0 - }, - { - "step": 16975, - "loss": 6.4507, - "learning_rate": 1.26e-05, - "inf_nan_count": 0 - }, - { - "step": 17000, - "loss": 6.3821, - "learning_rate": 1.25e-05, - "inf_nan_count": 0 - }, - { - "step": 17025, - "loss": 6.4234, - "learning_rate": 1.24e-05, - "inf_nan_count": 0 - }, - { - "step": 17050, - "loss": 6.4235, - "learning_rate": 1.23e-05, - "inf_nan_count": 0 - }, - { - "step": 17075, - "loss": 6.4856, - "learning_rate": 1.22e-05, - "inf_nan_count": 0 - }, - { - "step": 17100, - "loss": 6.4877, - "learning_rate": 1.21e-05, - "inf_nan_count": 0 - }, - { - "step": 17125, - "loss": 6.3683, - "learning_rate": 1.2e-05, - "inf_nan_count": 0 - }, - { - "step": 17150, - "loss": 6.4225, - "learning_rate": 1.19e-05, - "inf_nan_count": 0 - }, - { - "step": 17175, - "loss": 6.2573, - "learning_rate": 1.18e-05, - "inf_nan_count": 0 - }, - { - "step": 17200, - "loss": 6.3946, - "learning_rate": 1.17e-05, - "inf_nan_count": 0 - }, - { - "step": 17225, - "loss": 6.4607, - "learning_rate": 1.16e-05, - "inf_nan_count": 0 - }, - { - "step": 17250, - "loss": 6.4407, - "learning_rate": 1.15e-05, - "inf_nan_count": 0 - }, - { - "step": 17275, - "loss": 6.4333, - "learning_rate": 1.14e-05, - "inf_nan_count": 0 - }, - { - "step": 17300, - "loss": 6.3782, - "learning_rate": 1.13e-05, - "inf_nan_count": 0 - }, - { - "step": 17325, - "loss": 6.3665, - "learning_rate": 1.11e-05, - "inf_nan_count": 0 - }, - { - "step": 17350, - "loss": 6.4329, - "learning_rate": 1.1e-05, - "inf_nan_count": 0 - }, - { - "step": 17375, - "loss": 6.5107, - "learning_rate": 1.09e-05, - "inf_nan_count": 0 - }, - { - "step": 17400, - "loss": 6.5076, - "learning_rate": 1.08e-05, - "inf_nan_count": 0 - }, - { - "step": 17425, - "loss": 6.4936, - "learning_rate": 1.07e-05, - "inf_nan_count": 0 - }, - { - "step": 17450, - "loss": 6.4119, - "learning_rate": 1.06e-05, - "inf_nan_count": 0 - }, - { - "step": 17475, - "loss": 6.4032, - "learning_rate": 1.05e-05, - "inf_nan_count": 0 - }, - { - "step": 17500, - "loss": 6.3962, - "learning_rate": 1.04e-05, - "inf_nan_count": 0 - }, - { - "step": 17525, - "loss": 6.4288, - "learning_rate": 1.03e-05, - "inf_nan_count": 0 - }, - { - "step": 17550, - "loss": 6.4021, - "learning_rate": 1.02e-05, - "inf_nan_count": 0 - }, - { - "step": 17575, - "loss": 6.367, - "learning_rate": 1.01e-05, - "inf_nan_count": 0 - }, - { - "step": 17600, - "loss": 6.3904, - "learning_rate": 1e-05, - "inf_nan_count": 0 - }, - { - "step": 17625, - "loss": 6.5059, - "learning_rate": 9.9e-06, - "inf_nan_count": 0 - }, - { - "step": 17650, - "loss": 6.4225, - "learning_rate": 9.79e-06, - "inf_nan_count": 0 - }, - { - "step": 17675, - "loss": 6.4422, - "learning_rate": 9.69e-06, - "inf_nan_count": 0 - }, - { - "step": 17700, - "loss": 6.457, - "learning_rate": 9.58e-06, - "inf_nan_count": 0 - }, - { - "step": 17725, - "loss": 6.4475, - "learning_rate": 9.48e-06, - "inf_nan_count": 0 - }, - { - "step": 17750, - "loss": 6.3786, - "learning_rate": 9.38e-06, - "inf_nan_count": 0 - }, - { - "step": 17775, - "loss": 6.4145, - "learning_rate": 9.27e-06, - "inf_nan_count": 0 - }, - { - "step": 17800, - "loss": 6.3543, - "learning_rate": 9.17e-06, - "inf_nan_count": 0 - }, - { - "step": 17825, - "loss": 6.5116, - "learning_rate": 9.06e-06, - "inf_nan_count": 0 - }, - { - "step": 17850, - "loss": 6.4101, - "learning_rate": 8.96e-06, - "inf_nan_count": 0 - }, - { - "step": 17875, - "loss": 6.4014, - "learning_rate": 8.85e-06, - "inf_nan_count": 0 - }, - { - "step": 17900, - "loss": 6.4216, - "learning_rate": 8.75e-06, - "inf_nan_count": 0 - }, - { - "step": 17925, - "loss": 6.4539, - "learning_rate": 8.65e-06, - "inf_nan_count": 0 - }, - { - "step": 17950, - "loss": 6.4205, - "learning_rate": 8.54e-06, - "inf_nan_count": 0 - }, - { - "step": 17975, - "loss": 6.3865, - "learning_rate": 8.44e-06, - "inf_nan_count": 0 - }, - { - "step": 18000, - "loss": 6.4347, - "learning_rate": 8.33e-06, - "inf_nan_count": 0 - }, - { - "step": 18025, - "loss": 6.4313, - "learning_rate": 8.23e-06, - "inf_nan_count": 0 - }, - { - "step": 18050, - "loss": 6.3868, - "learning_rate": 8.13e-06, - "inf_nan_count": 0 - }, - { - "step": 18075, - "loss": 6.3703, - "learning_rate": 8.02e-06, - "inf_nan_count": 0 - }, - { - "step": 18100, - "loss": 6.3747, - "learning_rate": 7.92e-06, - "inf_nan_count": 0 - }, - { - "step": 18125, - "loss": 6.4228, - "learning_rate": 7.81e-06, - "inf_nan_count": 0 - }, - { - "step": 18150, - "loss": 6.349, - "learning_rate": 7.71e-06, - "inf_nan_count": 0 - }, - { - "step": 18175, - "loss": 6.4522, - "learning_rate": 7.6e-06, - "inf_nan_count": 0 - }, - { - "step": 18200, - "loss": 6.3354, - "learning_rate": 7.5e-06, - "inf_nan_count": 0 - }, - { - "step": 18225, - "loss": 6.4663, - "learning_rate": 7.4e-06, - "inf_nan_count": 0 - }, - { - "step": 18250, - "loss": 6.4155, - "learning_rate": 7.29e-06, - "inf_nan_count": 0 - }, - { - "step": 18275, - "loss": 6.4584, - "learning_rate": 7.19e-06, - "inf_nan_count": 0 - }, - { - "step": 18300, - "loss": 6.3637, - "learning_rate": 7.08e-06, - "inf_nan_count": 0 - }, - { - "step": 18325, - "loss": 6.3583, - "learning_rate": 6.98e-06, - "inf_nan_count": 0 - }, - { - "step": 18350, - "loss": 6.4469, - "learning_rate": 6.88e-06, - "inf_nan_count": 0 - }, - { - "step": 18375, - "loss": 6.3768, - "learning_rate": 6.77e-06, - "inf_nan_count": 0 - }, - { - "step": 18400, - "loss": 6.3179, - "learning_rate": 6.67e-06, - "inf_nan_count": 0 - }, - { - "step": 18425, - "loss": 6.4046, - "learning_rate": 6.56e-06, - "inf_nan_count": 0 - }, - { - "step": 18450, - "loss": 6.3435, - "learning_rate": 6.46e-06, - "inf_nan_count": 0 - }, - { - "step": 18475, - "loss": 6.3454, - "learning_rate": 6.35e-06, - "inf_nan_count": 0 - }, - { - "step": 18500, - "loss": 6.3922, - "learning_rate": 6.25e-06, - "inf_nan_count": 0 - }, - { - "step": 18525, - "loss": 6.3459, - "learning_rate": 6.15e-06, - "inf_nan_count": 0 - }, - { - "step": 18550, - "loss": 6.3591, - "learning_rate": 6.04e-06, - "inf_nan_count": 0 - }, - { - "step": 18575, - "loss": 6.4337, - "learning_rate": 5.94e-06, - "inf_nan_count": 0 - }, - { - "step": 18600, - "loss": 6.3962, - "learning_rate": 5.83e-06, - "inf_nan_count": 0 - }, - { - "step": 18625, - "loss": 6.3425, - "learning_rate": 5.73e-06, - "inf_nan_count": 0 - }, - { - "step": 18650, - "loss": 6.4022, - "learning_rate": 5.63e-06, - "inf_nan_count": 0 - }, - { - "step": 18675, - "loss": 6.4513, - "learning_rate": 5.52e-06, - "inf_nan_count": 0 - }, - { - "step": 18700, - "loss": 6.4284, - "learning_rate": 5.42e-06, - "inf_nan_count": 0 - }, - { - "step": 18725, - "loss": 6.3879, - "learning_rate": 5.31e-06, - "inf_nan_count": 0 - }, - { - "step": 18750, - "loss": 6.4009, - "learning_rate": 5.21e-06, - "inf_nan_count": 0 - }, - { - "step": 18775, - "loss": 6.3713, - "learning_rate": 5.1e-06, - "inf_nan_count": 0 - }, - { - "step": 18800, - "loss": 6.3752, - "learning_rate": 5e-06, - "inf_nan_count": 0 - }, - { - "step": 18825, - "loss": 6.4265, - "learning_rate": 4.9e-06, - "inf_nan_count": 0 - }, - { - "step": 18850, - "loss": 6.3709, - "learning_rate": 4.79e-06, - "inf_nan_count": 0 - }, - { - "step": 18875, - "loss": 6.3316, - "learning_rate": 4.69e-06, - "inf_nan_count": 0 - }, - { - "step": 18900, - "loss": 6.4479, - "learning_rate": 4.58e-06, - "inf_nan_count": 0 - }, - { - "step": 18925, - "loss": 6.4247, - "learning_rate": 4.48e-06, - "inf_nan_count": 0 - }, - { - "step": 18950, - "loss": 6.4126, - "learning_rate": 4.37e-06, - "inf_nan_count": 0 - }, - { - "step": 18975, - "loss": 6.3489, - "learning_rate": 4.27e-06, - "inf_nan_count": 0 - }, - { - "step": 19000, - "loss": 6.325, - "learning_rate": 4.17e-06, - "inf_nan_count": 0 - }, - { - "step": 19025, - "loss": 6.3306, - "learning_rate": 4.06e-06, - "inf_nan_count": 0 - }, - { - "step": 19050, - "loss": 6.387, - "learning_rate": 3.96e-06, - "inf_nan_count": 0 - }, - { - "step": 19075, - "loss": 6.4133, - "learning_rate": 3.85e-06, - "inf_nan_count": 0 - }, - { - "step": 19100, - "loss": 6.334, - "learning_rate": 3.75e-06, - "inf_nan_count": 0 - }, - { - "step": 19125, - "loss": 6.3034, - "learning_rate": 3.65e-06, - "inf_nan_count": 0 - }, - { - "step": 19150, - "loss": 6.4097, - "learning_rate": 3.54e-06, - "inf_nan_count": 0 - }, - { - "step": 19175, - "loss": 6.442, - "learning_rate": 3.44e-06, - "inf_nan_count": 0 - }, - { - "step": 19200, - "loss": 6.3756, - "learning_rate": 3.33e-06, - "inf_nan_count": 0 - }, - { - "step": 19225, - "loss": 6.4037, - "learning_rate": 3.23e-06, - "inf_nan_count": 0 - }, - { - "step": 19250, - "loss": 6.3974, - "learning_rate": 3.13e-06, - "inf_nan_count": 0 - }, - { - "step": 19275, - "loss": 6.3933, - "learning_rate": 3.02e-06, - "inf_nan_count": 0 - }, - { - "step": 19300, - "loss": 6.3269, - "learning_rate": 2.92e-06, - "inf_nan_count": 0 - }, - { - "step": 19325, - "loss": 6.3907, - "learning_rate": 2.81e-06, - "inf_nan_count": 0 - }, - { - "step": 19350, - "loss": 6.3955, - "learning_rate": 2.71e-06, - "inf_nan_count": 0 - }, - { - "step": 19375, - "loss": 6.3972, - "learning_rate": 2.6e-06, - "inf_nan_count": 0 - }, - { - "step": 19400, - "loss": 6.3896, - "learning_rate": 2.5e-06, - "inf_nan_count": 0 - }, - { - "step": 19425, - "loss": 6.3425, - "learning_rate": 2.4e-06, - "inf_nan_count": 0 - }, - { - "step": 19450, - "loss": 6.3587, - "learning_rate": 2.29e-06, - "inf_nan_count": 0 - }, - { - "step": 19475, - "loss": 6.4179, - "learning_rate": 2.19e-06, - "inf_nan_count": 0 - }, - { - "step": 19500, - "loss": 6.4192, - "learning_rate": 2.08e-06, - "inf_nan_count": 0 - }, - { - "step": 19525, - "loss": 6.4252, - "learning_rate": 1.98e-06, - "inf_nan_count": 0 - }, - { - "step": 19550, - "loss": 6.3349, - "learning_rate": 1.88e-06, - "inf_nan_count": 0 - }, - { - "step": 19575, - "loss": 6.4042, - "learning_rate": 1.77e-06, - "inf_nan_count": 0 - }, - { - "step": 19600, - "loss": 6.3567, - "learning_rate": 1.67e-06, - "inf_nan_count": 0 - }, - { - "step": 19625, - "loss": 6.3912, - "learning_rate": 1.56e-06, - "inf_nan_count": 0 - }, - { - "step": 19650, - "loss": 6.3113, - "learning_rate": 1.46e-06, - "inf_nan_count": 0 - }, - { - "step": 19675, - "loss": 6.3756, - "learning_rate": 1.35e-06, - "inf_nan_count": 0 - }, - { - "step": 19700, - "loss": 6.385, - "learning_rate": 1.25e-06, - "inf_nan_count": 0 - }, - { - "step": 19725, - "loss": 6.3631, - "learning_rate": 1.15e-06, - "inf_nan_count": 0 - }, - { - "step": 19750, - "loss": 6.4564, - "learning_rate": 1.04e-06, - "inf_nan_count": 0 - }, - { - "step": 19775, - "loss": 6.3258, - "learning_rate": 9.38e-07, - "inf_nan_count": 0 - }, - { - "step": 19800, - "loss": 6.4682, - "learning_rate": 8.33e-07, - "inf_nan_count": 0 - }, - { - "step": 19825, - "loss": 6.4421, - "learning_rate": 7.29e-07, - "inf_nan_count": 0 - }, - { - "step": 19850, - "loss": 6.4342, - "learning_rate": 6.25e-07, - "inf_nan_count": 0 - }, - { - "step": 19875, - "loss": 6.4182, - "learning_rate": 5.21e-07, - "inf_nan_count": 0 - }, - { - "step": 19900, - "loss": 6.3203, - "learning_rate": 4.17e-07, - "inf_nan_count": 0 - }, - { - "step": 19925, - "loss": 6.4339, - "learning_rate": 3.13e-07, - "inf_nan_count": 0 - }, - { - "step": 19950, - "loss": 6.4095, - "learning_rate": 2.08e-07, - "inf_nan_count": 0 - }, - { - "step": 19975, - "loss": 6.4814, - "learning_rate": 1.04e-07, - "inf_nan_count": 0 - } - ], - "evaluation_results": [ - { - "step": 1000, - "paloma": 7.125172406420199e+27 - }, - { - "step": 1500, - "paloma": 6.5469212698356e+18 - }, - { - "step": 2000, - "paloma": 5.118641309912889e+18 - }, - { - "step": 2500, - "paloma": 3.37924315167126e+18 - }, - { - "step": 3000, - "paloma": 6.892747900243237e+18 - }, - { - "step": 3500, - "paloma": 2.0436832271954907e+19 - }, - { - "step": 4000, - "paloma": 4.1410268232311005e+19 - }, - { - "step": 4500, - "paloma": 3.4524340411684053e+19 - }, - { - "step": 5000, - "paloma": 2.320698426399461e+19 - }, - { - "step": 5500, - "paloma": 3.1834097890526753e+19 - }, - { - "step": 6000, - "paloma": 4.457139025979801e+19 - }, - { - "step": 6500, - "paloma": 7.3062353841856406e+19 - }, - { - "step": 7000, - "paloma": 1.2357969480287024e+20 - }, - { - "step": 7500, - "paloma": 2.7199371732053928e+20 - }, - { - "step": 8000, - "paloma": 7.181862506006892e+20 - }, - { - "step": 8500, - "paloma": 1.5123285241831744e+21 - }, - { - "step": 9000, - "paloma": 3.573074534351724e+21 - }, - { - "step": 9500, - "paloma": 7.403721262078652e+21 - }, - { - "step": 10000, - "paloma": 1.0650515380055143e+22 - }, - { - "step": 10500, - "paloma": 2.1077589258137904e+22 - }, - { - "step": 11000, - "paloma": 2.712416409262884e+22 - }, - { - "step": 11500, - "paloma": 4.877238989481918e+22 - }, - { - "step": 12000, - "paloma": 7.219509956260661e+22 - }, - { - "step": 12500, - "paloma": 1.1729325953411656e+23 - }, - { - "step": 13000, - "paloma": 1.729306754923583e+23 - }, - { - "step": 13500, - "paloma": 2.4018454768029128e+23 - }, - { - "step": 14000, - "paloma": 3.247328955167052e+23 - }, - { - "step": 14500, - "paloma": 4.43239578722337e+23 - }, - { - "step": 15000, - "paloma": 5.215164570276226e+23 - }, - { - "step": 15500, - "paloma": 6.102665947946271e+23 - }, - { - "step": 16000, - "paloma": 8.874629945146669e+23 - }, - { - "step": 16500, - "paloma": 9.981607121011733e+23 - }, - { - "step": 17000, - "paloma": 1.1075349421086151e+24 - }, - { - "step": 17500, - "paloma": 1.1064948792133394e+24 - }, - { - "step": 18000, - "paloma": 1.340918782615931e+24 - }, - { - "step": 18500, - "paloma": 1.4325241176004668e+24 - }, - { - "step": 19000, - "paloma": 1.5360601246943468e+24 - }, - { - "step": 19500, - "paloma": 1.6346615942991742e+24 - }, - { - "step": 20000, - "paloma": 1.645368302099182e+24 - } - ], - "config": { - "d_model": 96, - "n_layers": 12, - "max_seq_len": 2048, - "vocab_size": 50304, - "lr": 5e-05, - "max_steps": 20000, - "batch_size": 1 - } - }, - { - "run_name": "pico-decoder-tiny-dolma29k-v1", - "log_file": "log_20250828_225300.log", - "training_metrics": [ - { - "step": 1000, - "loss": 7.7657, - "learning_rate": 0.00012, - "inf_nan_count": 0 - }, - { - "step": 1100, - "loss": 7.6733, - "learning_rate": 0.000132, - "inf_nan_count": 0 - }, - { - "step": 1200, - "loss": 7.5969, - "learning_rate": 0.000144, - "inf_nan_count": 0 - }, - { - "step": 1300, - "loss": 7.4765, - "learning_rate": 0.000156, - "inf_nan_count": 0 - }, - { - "step": 1400, - "loss": 7.3686, - "learning_rate": 0.000168, - "inf_nan_count": 0 - }, - { - "step": 1500, - "loss": 7.3251, - "learning_rate": 0.00018, - "inf_nan_count": 0 - }, - { - "step": 1600, - "loss": 7.184, - "learning_rate": 0.000192, - "inf_nan_count": 0 - }, - { - "step": 1700, - "loss": 7.1116, - "learning_rate": 0.000204, - "inf_nan_count": 0 - }, - { - "step": 1800, - "loss": 7.0565, - "learning_rate": 0.000216, - "inf_nan_count": 0 - }, - { - "step": 1900, - "loss": 6.9964, - "learning_rate": 0.000228, - "inf_nan_count": 0 - }, - { - "step": 2000, - "loss": 6.969, - "learning_rate": 0.00024, - "inf_nan_count": 0 - }, - { - "step": 2100, - "loss": 6.884, - "learning_rate": 0.000252, - "inf_nan_count": 0 - }, - { - "step": 2200, - "loss": 6.8334, - "learning_rate": 0.000264, - "inf_nan_count": 0 - }, - { - "step": 2300, - "loss": 6.815, - "learning_rate": 0.000276, - "inf_nan_count": 0 - }, - { - "step": 2400, - "loss": 6.7519, - "learning_rate": 0.000288, - "inf_nan_count": 0 - }, - { - "step": 2500, - "loss": 6.6908, - "learning_rate": 0.0003, - "inf_nan_count": 0 - }, - { - "step": 2600, - "loss": 6.6351, - "learning_rate": 0.0003, - "inf_nan_count": 0 - }, - { - "step": 2700, - "loss": 6.5568, - "learning_rate": 0.0003, - "inf_nan_count": 0 - }, - { - "step": 2800, - "loss": 6.5799, - "learning_rate": 0.0003, - "inf_nan_count": 0 - }, - { - "step": 2900, - "loss": 6.5467, - "learning_rate": 0.000299, - "inf_nan_count": 0 - }, - { - "step": 3000, - "loss": 6.4865, - "learning_rate": 0.000299, - "inf_nan_count": 0 - }, - { - "step": 3100, - "loss": 6.4604, - "learning_rate": 0.000299, - "inf_nan_count": 0 - }, - { - "step": 3200, - "loss": 6.4205, - "learning_rate": 0.000299, - "inf_nan_count": 0 - }, - { - "step": 3300, - "loss": 6.4127, - "learning_rate": 0.000299, - "inf_nan_count": 0 - }, - { - "step": 3400, - "loss": 6.3692, - "learning_rate": 0.000299, - "inf_nan_count": 0 - }, - { - "step": 3500, - "loss": 6.3761, - "learning_rate": 0.000298, - "inf_nan_count": 0 - }, - { - "step": 3600, - "loss": 6.2796, - "learning_rate": 0.000298, - "inf_nan_count": 0 - }, - { - "step": 3700, - "loss": 6.2988, - "learning_rate": 0.000298, - "inf_nan_count": 0 - }, - { - "step": 3800, - "loss": 6.2673, - "learning_rate": 0.000298, - "inf_nan_count": 0 - }, - { - "step": 3900, - "loss": 6.2715, - "learning_rate": 0.000298, - "inf_nan_count": 0 - }, - { - "step": 4000, - "loss": 6.189, - "learning_rate": 0.000298, - "inf_nan_count": 0 - }, - { - "step": 4100, - "loss": 6.1832, - "learning_rate": 0.000298, - "inf_nan_count": 0 - }, - { - "step": 4200, - "loss": 6.1553, - "learning_rate": 0.000297, - "inf_nan_count": 0 - }, - { - "step": 4300, - "loss": 6.1629, - "learning_rate": 0.000297, - "inf_nan_count": 0 - }, - { - "step": 4400, - "loss": 6.1061, - "learning_rate": 0.000297, - "inf_nan_count": 0 - }, - { - "step": 4500, - "loss": 6.1601, - "learning_rate": 0.000297, - "inf_nan_count": 0 - }, - { - "step": 4600, - "loss": 6.0963, - "learning_rate": 0.000297, - "inf_nan_count": 0 - }, - { - "step": 4700, - "loss": 6.078, - "learning_rate": 0.000297, - "inf_nan_count": 0 - }, - { - "step": 4800, - "loss": 6.0835, - "learning_rate": 0.000297, - "inf_nan_count": 0 - }, - { - "step": 4900, - "loss": 6.0519, - "learning_rate": 0.000296, - "inf_nan_count": 0 - }, - { - "step": 5000, - "loss": 6.0661, - "learning_rate": 0.000296, - "inf_nan_count": 0 - }, - { - "step": 5100, - "loss": 6.0121, - "learning_rate": 0.000296, - "inf_nan_count": 0 - }, - { - "step": 5200, - "loss": 6.0544, - "learning_rate": 0.000296, - "inf_nan_count": 0 - }, - { - "step": 5300, - "loss": 6.0224, - "learning_rate": 0.000296, - "inf_nan_count": 0 - }, - { - "step": 5400, - "loss": 5.9831, - "learning_rate": 0.000296, - "inf_nan_count": 0 - }, - { - "step": 5500, - "loss": 5.9553, - "learning_rate": 0.000295, - "inf_nan_count": 0 - }, - { - "step": 5600, - "loss": 5.9493, - "learning_rate": 0.000295, - "inf_nan_count": 0 - }, - { - "step": 5700, - "loss": 5.9943, - "learning_rate": 0.000295, - "inf_nan_count": 0 - }, - { - "step": 5800, - "loss": 5.963, - "learning_rate": 0.000295, - "inf_nan_count": 0 - }, - { - "step": 5900, - "loss": 5.9349, - "learning_rate": 0.000295, - "inf_nan_count": 0 - }, - { - "step": 6000, - "loss": 5.9087, - "learning_rate": 0.000295, - "inf_nan_count": 0 - }, - { - "step": 6100, - "loss": 5.8818, - "learning_rate": 0.000295, - "inf_nan_count": 0 - }, - { - "step": 6200, - "loss": 5.8535, - "learning_rate": 0.000294, - "inf_nan_count": 0 - }, - { - "step": 6300, - "loss": 5.8896, - "learning_rate": 0.000294, - "inf_nan_count": 0 - }, - { - "step": 6400, - "loss": 5.9007, - "learning_rate": 0.000294, - "inf_nan_count": 0 - }, - { - "step": 6500, - "loss": 5.8617, - "learning_rate": 0.000294, - "inf_nan_count": 0 - }, - { - "step": 6600, - "loss": 5.8201, - "learning_rate": 0.000294, - "inf_nan_count": 0 - }, - { - "step": 6700, - "loss": 5.8544, - "learning_rate": 0.000294, - "inf_nan_count": 0 - }, - { - "step": 6800, - "loss": 5.8532, - "learning_rate": 0.000293, - "inf_nan_count": 0 - }, - { - "step": 6900, - "loss": 5.795, - "learning_rate": 0.000293, - "inf_nan_count": 0 - }, - { - "step": 7000, - "loss": 5.8146, - "learning_rate": 0.000293, - "inf_nan_count": 0 - }, - { - "step": 7100, - "loss": 5.793, - "learning_rate": 0.000293, - "inf_nan_count": 0 - }, - { - "step": 7200, - "loss": 5.7827, - "learning_rate": 0.000293, - "inf_nan_count": 0 - }, - { - "step": 7300, - "loss": 5.7816, - "learning_rate": 0.000293, - "inf_nan_count": 0 - }, - { - "step": 7400, - "loss": 5.73, - "learning_rate": 0.000293, - "inf_nan_count": 0 - }, - { - "step": 7500, - "loss": 5.767, - "learning_rate": 0.000292, - "inf_nan_count": 0 - }, - { - "step": 7600, - "loss": 5.745, - "learning_rate": 0.000292, - "inf_nan_count": 0 - }, - { - "step": 7700, - "loss": 5.7499, - "learning_rate": 0.000292, - "inf_nan_count": 0 - }, - { - "step": 7800, - "loss": 5.7233, - "learning_rate": 0.000292, - "inf_nan_count": 0 - }, - { - "step": 7900, - "loss": 5.7219, - "learning_rate": 0.000292, - "inf_nan_count": 0 - }, - { - "step": 8000, - "loss": 5.7523, - "learning_rate": 0.000292, - "inf_nan_count": 0 - }, - { - "step": 8100, - "loss": 5.7145, - "learning_rate": 0.000291, - "inf_nan_count": 0 - }, - { - "step": 8200, - "loss": 5.7469, - "learning_rate": 0.000291, - "inf_nan_count": 0 - }, - { - "step": 8300, - "loss": 5.7363, - "learning_rate": 0.000291, - "inf_nan_count": 0 - }, - { - "step": 8400, - "loss": 5.6938, - "learning_rate": 0.000291, - "inf_nan_count": 0 - }, - { - "step": 8500, - "loss": 5.6994, - "learning_rate": 0.000291, - "inf_nan_count": 0 - }, - { - "step": 8600, - "loss": 5.6583, - "learning_rate": 0.000291, - "inf_nan_count": 0 - }, - { - "step": 8700, - "loss": 5.6885, - "learning_rate": 0.000291, - "inf_nan_count": 0 - }, - { - "step": 8800, - "loss": 5.6313, - "learning_rate": 0.00029, - "inf_nan_count": 0 - }, - { - "step": 8900, - "loss": 5.6314, - "learning_rate": 0.00029, - "inf_nan_count": 0 - }, - { - "step": 9000, - "loss": 5.6501, - "learning_rate": 0.00029, - "inf_nan_count": 0 - }, - { - "step": 9100, - "loss": 5.6357, - "learning_rate": 0.00029, - "inf_nan_count": 0 - }, - { - "step": 9200, - "loss": 5.6045, - "learning_rate": 0.00029, - "inf_nan_count": 0 - }, - { - "step": 9300, - "loss": 5.6405, - "learning_rate": 0.00029, - "inf_nan_count": 0 - }, - { - "step": 9400, - "loss": 5.6241, - "learning_rate": 0.00029, - "inf_nan_count": 0 - }, - { - "step": 9500, - "loss": 5.6247, - "learning_rate": 0.000289, - "inf_nan_count": 0 - }, - { - "step": 9600, - "loss": 5.5983, - "learning_rate": 0.000289, - "inf_nan_count": 0 - }, - { - "step": 9700, - "loss": 5.5978, - "learning_rate": 0.000289, - "inf_nan_count": 0 - }, - { - "step": 9800, - "loss": 5.5746, - "learning_rate": 0.000289, - "inf_nan_count": 0 - } - ], - "evaluation_results": [ - { - "step": 1000, - "paloma": 2.5468931158531133e+19 - }, - { - "step": 2000, - "paloma": 3.627192449295412e+21 - }, - { - "step": 3000, - "paloma": 9.90975658825673e+22 - }, - { - "step": 4000, - "paloma": 2.6252526658823776e+24 - }, - { - "step": 5000, - "paloma": 7.294956881845611e+25 - }, - { - "step": 6000, - "paloma": 1.6856570425562805e+27 - }, - { - "step": 7000, - "paloma": 9.22180682233585e+28 - }, - { - "step": 8000, - "paloma": 3.1300823362207656e+29 - }, - { - "step": 9000, - "paloma": 4.983924509492406e+30 - } - ], - "config": { - "d_model": 96, - "n_layers": 12, - "max_seq_len": 2048, - "vocab_size": 50304, - "lr": 0.0003, - "max_steps": 200000, - "batch_size": 1 - } - }, - { - "run_name": "pico-decoder-tiny-dolma-teensy-v0", - "log_file": "log_20250828_210922.log", - "training_metrics": [ - { - "step": 0, - "loss": 10.9914, - "learning_rate": 0.0, - "inf_nan_count": 0 - } - ], - "evaluation_results": [ - { - "step": 0, - "paloma": 59434.76600609756 - }, - { - "step": 27, - "paloma": 59120.39268292683 - } - ], - "config": { - "d_model": 96, - "n_layers": 12, - "max_seq_len": 2048, - "vocab_size": 50304, - "lr": 0.0003, - "max_steps": 200000, - "batch_size": 8 - } - }, - { - "run_name": "pico-decoder-tiny-dolma-teensy-v1", - "log_file": "log_20250828_220514.log", - "training_metrics": [ - { - "step": 0, - "loss": 10.9886, - "learning_rate": 0.0, - "inf_nan_count": 0 - }, - { - "step": 100, - "loss": 10.9373, - "learning_rate": 1.2e-05, - "inf_nan_count": 0 - }, - { - "step": 200, - "loss": 10.5423, - "learning_rate": 2.4e-05, - "inf_nan_count": 0 - }, - { - "step": 300, - "loss": 9.9452, - "learning_rate": 3.6e-05, - "inf_nan_count": 0 - }, - { - "step": 400, - "loss": 9.449, - "learning_rate": 4.8e-05, - "inf_nan_count": 0 - }, - { - "step": 500, - "loss": 8.8455, - "learning_rate": 6e-05, - "inf_nan_count": 0 - }, - { - "step": 600, - "loss": 8.1482, - "learning_rate": 7.2e-05, - "inf_nan_count": 0 - }, - { - "step": 700, - "loss": 7.4303, - "learning_rate": 8.4e-05, - "inf_nan_count": 0 - }, - { - "step": 800, - "loss": 7.0363, - "learning_rate": 9.6e-05, - "inf_nan_count": 0 - }, - { - "step": 900, - "loss": 6.9702, - "learning_rate": 0.000108, - "inf_nan_count": 0 - }, - { - "step": 1000, - "loss": 6.8975, - "learning_rate": 0.00012, - "inf_nan_count": 0 - }, - { - "step": 1100, - "loss": 6.892, - "learning_rate": 0.000132, - "inf_nan_count": 0 - }, - { - "step": 1200, - "loss": 6.6684, - "learning_rate": 0.000144, - "inf_nan_count": 0 - }, - { - "step": 1300, - "loss": 6.4754, - "learning_rate": 0.000156, - "inf_nan_count": 0 - }, - { - "step": 1400, - "loss": 6.3649, - "learning_rate": 0.000168, - "inf_nan_count": 0 - }, - { - "step": 1500, - "loss": 6.2981, - "learning_rate": 0.00018, - "inf_nan_count": 0 - }, - { - "step": 1600, - "loss": 6.1551, - "learning_rate": 0.000192, - "inf_nan_count": 0 - }, - { - "step": 1700, - "loss": 5.9163, - "learning_rate": 0.000204, - "inf_nan_count": 0 - } - ], - "evaluation_results": [ - { - "step": 1000, - "paloma": 9.54583880403771e+19 - }, - { - "step": 1755, - "paloma": 2.945795672816324e+21 - } - ], - "config": { - "d_model": 96, - "n_layers": 12, - "max_seq_len": 2048, - "vocab_size": 50304, - "lr": 0.0003, - "max_steps": 200000, - "batch_size": 4 - } - }, - { - "run_name": "pico-decoder-tiny-dolma5M-v1", - "log_file": "log_20250830_014108.log", - "training_metrics": [ - { - "step": 32000, - "loss": 6.3376, - "learning_rate": 7.32e-06, - "inf_nan_count": 0 - }, - { - "step": 32025, - "loss": 6.1999, - "learning_rate": 7.28e-06, - "inf_nan_count": 0 - }, - { - "step": 32050, - "loss": 6.1488, - "learning_rate": 7.24e-06, - "inf_nan_count": 0 - }, - { - "step": 32075, - "loss": 6.046, - "learning_rate": 7.19e-06, - "inf_nan_count": 0 - } - ], - "evaluation_results": [ - { - "step": 32000, - "paloma": 2.977755235898109e+26 - } - ], - "config": { - "d_model": 96, - "n_layers": 12, - "max_seq_len": 2048, - "vocab_size": 50304, - "lr": 5e-05, - "max_steps": 20000, - "batch_size": 1 - } - } - ], - "summary": { - "total_runs": 6, - "run_names": [ - "pico-decoder-tiny-dolma29k-v2", - "pico-decoder-tiny-dolma29k-v3", - "pico-decoder-tiny-dolma29k-v1", - "pico-decoder-tiny-dolma-teensy-v0", - "pico-decoder-tiny-dolma-teensy-v1", - "pico-decoder-tiny-dolma5M-v1" - ] - } -} \ No newline at end of file