ThomasTheMaker's picture
Upload folder using huggingface_hub
feba2ad verified
raw
history blame
145 kB
{
"runs": [
{
"run_name": "pico-decoder-tiny-dolma29k-v2",
"log_file": "log_20250829_003838.log",
"training_metrics": [
{
"step": 0,
"loss": 10.9848,
"learning_rate": 0.0,
"inf_nan_count": 0
},
{
"step": 50,
"loss": 11.0005,
"learning_rate": 1e-06,
"inf_nan_count": 0
},
{
"step": 100,
"loss": 10.9918,
"learning_rate": 2e-06,
"inf_nan_count": 0
},
{
"step": 150,
"loss": 10.9776,
"learning_rate": 3e-06,
"inf_nan_count": 0
},
{
"step": 200,
"loss": 10.9569,
"learning_rate": 4e-06,
"inf_nan_count": 0
},
{
"step": 250,
"loss": 10.9255,
"learning_rate": 5e-06,
"inf_nan_count": 0
},
{
"step": 300,
"loss": 10.8883,
"learning_rate": 6e-06,
"inf_nan_count": 0
},
{
"step": 350,
"loss": 10.8249,
"learning_rate": 7e-06,
"inf_nan_count": 0
},
{
"step": 400,
"loss": 10.7344,
"learning_rate": 8e-06,
"inf_nan_count": 0
},
{
"step": 450,
"loss": 10.6177,
"learning_rate": 9e-06,
"inf_nan_count": 0
},
{
"step": 500,
"loss": 10.5025,
"learning_rate": 1e-05,
"inf_nan_count": 0
},
{
"step": 550,
"loss": 10.3986,
"learning_rate": 1.1e-05,
"inf_nan_count": 0
},
{
"step": 600,
"loss": 10.3079,
"learning_rate": 1.2e-05,
"inf_nan_count": 0
},
{
"step": 650,
"loss": 10.2142,
"learning_rate": 1.3e-05,
"inf_nan_count": 0
},
{
"step": 700,
"loss": 10.1146,
"learning_rate": 1.4e-05,
"inf_nan_count": 0
},
{
"step": 750,
"loss": 10.0398,
"learning_rate": 1.5e-05,
"inf_nan_count": 0
},
{
"step": 800,
"loss": 9.9311,
"learning_rate": 1.6e-05,
"inf_nan_count": 0
},
{
"step": 850,
"loss": 9.8431,
"learning_rate": 1.7e-05,
"inf_nan_count": 0
},
{
"step": 900,
"loss": 9.7453,
"learning_rate": 1.8e-05,
"inf_nan_count": 0
},
{
"step": 950,
"loss": 9.6527,
"learning_rate": 1.9e-05,
"inf_nan_count": 0
},
{
"step": 1000,
"loss": 9.5691,
"learning_rate": 2e-05,
"inf_nan_count": 0
},
{
"step": 1050,
"loss": 9.46,
"learning_rate": 2.1e-05,
"inf_nan_count": 0
},
{
"step": 1100,
"loss": 9.3525,
"learning_rate": 2.2e-05,
"inf_nan_count": 0
},
{
"step": 1150,
"loss": 9.2715,
"learning_rate": 2.3e-05,
"inf_nan_count": 0
},
{
"step": 1200,
"loss": 9.1618,
"learning_rate": 2.4e-05,
"inf_nan_count": 0
},
{
"step": 1250,
"loss": 9.0547,
"learning_rate": 2.5e-05,
"inf_nan_count": 0
},
{
"step": 1300,
"loss": 8.955,
"learning_rate": 2.6e-05,
"inf_nan_count": 0
},
{
"step": 1350,
"loss": 8.8251,
"learning_rate": 2.7e-05,
"inf_nan_count": 0
},
{
"step": 1400,
"loss": 8.7711,
"learning_rate": 2.8e-05,
"inf_nan_count": 0
},
{
"step": 1450,
"loss": 8.6834,
"learning_rate": 2.9e-05,
"inf_nan_count": 0
},
{
"step": 1500,
"loss": 8.5638,
"learning_rate": 3e-05,
"inf_nan_count": 0
},
{
"step": 1550,
"loss": 8.4572,
"learning_rate": 3.1e-05,
"inf_nan_count": 0
},
{
"step": 1600,
"loss": 8.394,
"learning_rate": 3.2e-05,
"inf_nan_count": 0
},
{
"step": 1650,
"loss": 8.2973,
"learning_rate": 3.3e-05,
"inf_nan_count": 0
},
{
"step": 1700,
"loss": 8.2264,
"learning_rate": 3.4e-05,
"inf_nan_count": 0
},
{
"step": 1750,
"loss": 8.1672,
"learning_rate": 3.5e-05,
"inf_nan_count": 0
},
{
"step": 1800,
"loss": 8.0695,
"learning_rate": 3.6e-05,
"inf_nan_count": 0
},
{
"step": 1850,
"loss": 8.0299,
"learning_rate": 3.7e-05,
"inf_nan_count": 0
},
{
"step": 1900,
"loss": 7.9883,
"learning_rate": 3.8e-05,
"inf_nan_count": 0
},
{
"step": 1950,
"loss": 7.9429,
"learning_rate": 3.9e-05,
"inf_nan_count": 0
},
{
"step": 2000,
"loss": 7.8447,
"learning_rate": 4e-05,
"inf_nan_count": 0
},
{
"step": 2050,
"loss": 7.838,
"learning_rate": 4.1e-05,
"inf_nan_count": 0
},
{
"step": 2100,
"loss": 7.7671,
"learning_rate": 4.2e-05,
"inf_nan_count": 0
},
{
"step": 2150,
"loss": 7.7637,
"learning_rate": 4.3e-05,
"inf_nan_count": 0
},
{
"step": 2200,
"loss": 7.706,
"learning_rate": 4.4e-05,
"inf_nan_count": 0
},
{
"step": 2250,
"loss": 7.7607,
"learning_rate": 4.5e-05,
"inf_nan_count": 0
},
{
"step": 2300,
"loss": 7.7076,
"learning_rate": 4.6e-05,
"inf_nan_count": 0
},
{
"step": 2350,
"loss": 7.6787,
"learning_rate": 4.7e-05,
"inf_nan_count": 0
},
{
"step": 2400,
"loss": 7.6446,
"learning_rate": 4.8e-05,
"inf_nan_count": 0
},
{
"step": 2450,
"loss": 7.5999,
"learning_rate": 4.9e-05,
"inf_nan_count": 0
},
{
"step": 2500,
"loss": 7.6154,
"learning_rate": 5e-05,
"inf_nan_count": 0
},
{
"step": 2550,
"loss": 7.5627,
"learning_rate": 5.1e-05,
"inf_nan_count": 0
},
{
"step": 2600,
"loss": 7.5747,
"learning_rate": 5.2e-05,
"inf_nan_count": 0
},
{
"step": 2650,
"loss": 7.5358,
"learning_rate": 5.3e-05,
"inf_nan_count": 0
},
{
"step": 2700,
"loss": 7.5148,
"learning_rate": 5.4e-05,
"inf_nan_count": 0
},
{
"step": 2750,
"loss": 7.4874,
"learning_rate": 5.5e-05,
"inf_nan_count": 0
},
{
"step": 2800,
"loss": 7.4438,
"learning_rate": 5.6e-05,
"inf_nan_count": 0
},
{
"step": 2850,
"loss": 7.4772,
"learning_rate": 5.7e-05,
"inf_nan_count": 0
},
{
"step": 2900,
"loss": 7.4135,
"learning_rate": 5.8e-05,
"inf_nan_count": 0
},
{
"step": 2950,
"loss": 7.3929,
"learning_rate": 5.9e-05,
"inf_nan_count": 0
},
{
"step": 3000,
"loss": 7.3566,
"learning_rate": 6e-05,
"inf_nan_count": 0
},
{
"step": 3050,
"loss": 7.3318,
"learning_rate": 6.1e-05,
"inf_nan_count": 0
},
{
"step": 3100,
"loss": 7.3114,
"learning_rate": 6.2e-05,
"inf_nan_count": 0
},
{
"step": 3150,
"loss": 7.2734,
"learning_rate": 6.3e-05,
"inf_nan_count": 0
},
{
"step": 3200,
"loss": 7.322,
"learning_rate": 6.4e-05,
"inf_nan_count": 0
},
{
"step": 3250,
"loss": 7.2621,
"learning_rate": 6.5e-05,
"inf_nan_count": 0
},
{
"step": 3300,
"loss": 7.2257,
"learning_rate": 6.6e-05,
"inf_nan_count": 0
},
{
"step": 3350,
"loss": 7.2447,
"learning_rate": 6.7e-05,
"inf_nan_count": 0
},
{
"step": 3400,
"loss": 7.2344,
"learning_rate": 6.8e-05,
"inf_nan_count": 0
},
{
"step": 3450,
"loss": 7.1488,
"learning_rate": 6.9e-05,
"inf_nan_count": 0
},
{
"step": 3500,
"loss": 7.1797,
"learning_rate": 7e-05,
"inf_nan_count": 0
},
{
"step": 3550,
"loss": 7.1737,
"learning_rate": 7.1e-05,
"inf_nan_count": 0
},
{
"step": 3600,
"loss": 7.1204,
"learning_rate": 7.2e-05,
"inf_nan_count": 0
},
{
"step": 3650,
"loss": 7.1102,
"learning_rate": 7.3e-05,
"inf_nan_count": 0
},
{
"step": 3700,
"loss": 7.0845,
"learning_rate": 7.4e-05,
"inf_nan_count": 0
},
{
"step": 3750,
"loss": 7.0858,
"learning_rate": 7.5e-05,
"inf_nan_count": 0
},
{
"step": 3800,
"loss": 7.0362,
"learning_rate": 7.6e-05,
"inf_nan_count": 0
},
{
"step": 3850,
"loss": 7.0603,
"learning_rate": 7.7e-05,
"inf_nan_count": 0
},
{
"step": 3900,
"loss": 7.0172,
"learning_rate": 7.8e-05,
"inf_nan_count": 0
},
{
"step": 3950,
"loss": 6.9948,
"learning_rate": 7.9e-05,
"inf_nan_count": 0
},
{
"step": 4000,
"loss": 6.9909,
"learning_rate": 8e-05,
"inf_nan_count": 0
},
{
"step": 4050,
"loss": 6.9477,
"learning_rate": 8.1e-05,
"inf_nan_count": 0
},
{
"step": 4100,
"loss": 6.9651,
"learning_rate": 8.2e-05,
"inf_nan_count": 0
},
{
"step": 4150,
"loss": 6.9149,
"learning_rate": 8.3e-05,
"inf_nan_count": 0
},
{
"step": 4200,
"loss": 6.893,
"learning_rate": 8.4e-05,
"inf_nan_count": 0
},
{
"step": 4250,
"loss": 6.9227,
"learning_rate": 8.5e-05,
"inf_nan_count": 0
},
{
"step": 4300,
"loss": 6.879,
"learning_rate": 8.6e-05,
"inf_nan_count": 0
},
{
"step": 4350,
"loss": 6.8649,
"learning_rate": 8.7e-05,
"inf_nan_count": 0
},
{
"step": 4400,
"loss": 6.8305,
"learning_rate": 8.8e-05,
"inf_nan_count": 0
},
{
"step": 4450,
"loss": 6.8085,
"learning_rate": 8.9e-05,
"inf_nan_count": 0
},
{
"step": 4500,
"loss": 6.8315,
"learning_rate": 9e-05,
"inf_nan_count": 0
},
{
"step": 4550,
"loss": 6.7885,
"learning_rate": 9.1e-05,
"inf_nan_count": 0
},
{
"step": 4600,
"loss": 6.7805,
"learning_rate": 9.2e-05,
"inf_nan_count": 0
},
{
"step": 4650,
"loss": 6.7737,
"learning_rate": 9.3e-05,
"inf_nan_count": 0
},
{
"step": 4700,
"loss": 6.7649,
"learning_rate": 9.4e-05,
"inf_nan_count": 0
},
{
"step": 4750,
"loss": 6.7562,
"learning_rate": 9.5e-05,
"inf_nan_count": 0
},
{
"step": 4800,
"loss": 6.7347,
"learning_rate": 9.6e-05,
"inf_nan_count": 0
},
{
"step": 4850,
"loss": 6.7161,
"learning_rate": 9.7e-05,
"inf_nan_count": 0
},
{
"step": 4900,
"loss": 6.6889,
"learning_rate": 9.8e-05,
"inf_nan_count": 0
},
{
"step": 4950,
"loss": 6.7299,
"learning_rate": 9.9e-05,
"inf_nan_count": 0
},
{
"step": 5000,
"loss": 6.6605,
"learning_rate": 0.0001,
"inf_nan_count": 0
},
{
"step": 5050,
"loss": 6.6552,
"learning_rate": 0.0001,
"inf_nan_count": 0
},
{
"step": 5100,
"loss": 6.7038,
"learning_rate": 9.99e-05,
"inf_nan_count": 0
},
{
"step": 5150,
"loss": 6.6452,
"learning_rate": 9.99e-05,
"inf_nan_count": 0
},
{
"step": 5200,
"loss": 6.6522,
"learning_rate": 9.99e-05,
"inf_nan_count": 0
},
{
"step": 5250,
"loss": 6.627,
"learning_rate": 9.99e-05,
"inf_nan_count": 0
},
{
"step": 5300,
"loss": 6.5733,
"learning_rate": 9.98e-05,
"inf_nan_count": 0
},
{
"step": 5350,
"loss": 6.5833,
"learning_rate": 9.98e-05,
"inf_nan_count": 0
},
{
"step": 5400,
"loss": 6.5854,
"learning_rate": 9.98e-05,
"inf_nan_count": 0
},
{
"step": 5450,
"loss": 6.6012,
"learning_rate": 9.98e-05,
"inf_nan_count": 0
},
{
"step": 5500,
"loss": 6.5786,
"learning_rate": 9.97e-05,
"inf_nan_count": 0
}
],
"evaluation_results": [
{
"step": 1000,
"paloma": 5.073320568651489e+18
},
{
"step": 2000,
"paloma": 1.8978577072995303e+19
},
{
"step": 3000,
"paloma": 3.1701596694317715e+19
},
{
"step": 4000,
"paloma": 2.5015965971757485e+20
},
{
"step": 5000,
"paloma": 2.38712860824014e+21
}
],
"config": {
"d_model": 96,
"n_layers": 12,
"max_seq_len": 2048,
"vocab_size": 50304,
"lr": 0.0001,
"max_steps": 200000,
"batch_size": 1
}
},
{
"run_name": "pico-decoder-tiny-dolma29k-v3",
"log_file": "log_20250829_020629.log",
"training_metrics": [
{
"step": 500,
"loss": 10.8854,
"learning_rate": 3.13e-06,
"inf_nan_count": 0
},
{
"step": 525,
"loss": 10.889,
"learning_rate": 3.28e-06,
"inf_nan_count": 0
},
{
"step": 550,
"loss": 10.8846,
"learning_rate": 3.44e-06,
"inf_nan_count": 0
},
{
"step": 575,
"loss": 10.8657,
"learning_rate": 3.59e-06,
"inf_nan_count": 0
},
{
"step": 600,
"loss": 10.859,
"learning_rate": 3.75e-06,
"inf_nan_count": 0
},
{
"step": 625,
"loss": 10.8328,
"learning_rate": 3.91e-06,
"inf_nan_count": 0
},
{
"step": 650,
"loss": 10.8166,
"learning_rate": 4.06e-06,
"inf_nan_count": 0
},
{
"step": 675,
"loss": 10.7913,
"learning_rate": 4.22e-06,
"inf_nan_count": 0
},
{
"step": 700,
"loss": 10.7609,
"learning_rate": 4.37e-06,
"inf_nan_count": 0
},
{
"step": 725,
"loss": 10.7322,
"learning_rate": 4.53e-06,
"inf_nan_count": 0
},
{
"step": 750,
"loss": 10.7121,
"learning_rate": 4.69e-06,
"inf_nan_count": 0
},
{
"step": 775,
"loss": 10.6877,
"learning_rate": 4.84e-06,
"inf_nan_count": 0
},
{
"step": 800,
"loss": 10.6436,
"learning_rate": 5e-06,
"inf_nan_count": 0
},
{
"step": 825,
"loss": 10.6256,
"learning_rate": 5.16e-06,
"inf_nan_count": 0
},
{
"step": 850,
"loss": 10.5961,
"learning_rate": 5.31e-06,
"inf_nan_count": 0
},
{
"step": 875,
"loss": 10.5443,
"learning_rate": 5.47e-06,
"inf_nan_count": 0
},
{
"step": 900,
"loss": 10.5197,
"learning_rate": 5.63e-06,
"inf_nan_count": 0
},
{
"step": 925,
"loss": 10.4854,
"learning_rate": 5.78e-06,
"inf_nan_count": 0
},
{
"step": 950,
"loss": 10.4826,
"learning_rate": 5.94e-06,
"inf_nan_count": 0
},
{
"step": 975,
"loss": 10.4557,
"learning_rate": 6.09e-06,
"inf_nan_count": 0
},
{
"step": 1000,
"loss": 10.4142,
"learning_rate": 6.25e-06,
"inf_nan_count": 0
},
{
"step": 1025,
"loss": 10.3885,
"learning_rate": 6.41e-06,
"inf_nan_count": 0
},
{
"step": 1050,
"loss": 10.3737,
"learning_rate": 6.56e-06,
"inf_nan_count": 0
},
{
"step": 1075,
"loss": 10.3534,
"learning_rate": 6.72e-06,
"inf_nan_count": 0
},
{
"step": 1100,
"loss": 10.3219,
"learning_rate": 6.88e-06,
"inf_nan_count": 0
},
{
"step": 1125,
"loss": 10.3064,
"learning_rate": 7.03e-06,
"inf_nan_count": 0
},
{
"step": 1150,
"loss": 10.2761,
"learning_rate": 7.19e-06,
"inf_nan_count": 0
},
{
"step": 1175,
"loss": 10.2592,
"learning_rate": 7.34e-06,
"inf_nan_count": 0
},
{
"step": 1200,
"loss": 10.242,
"learning_rate": 7.5e-06,
"inf_nan_count": 0
},
{
"step": 1225,
"loss": 10.2141,
"learning_rate": 7.66e-06,
"inf_nan_count": 0
},
{
"step": 1250,
"loss": 10.1882,
"learning_rate": 7.81e-06,
"inf_nan_count": 0
},
{
"step": 1275,
"loss": 10.1608,
"learning_rate": 7.97e-06,
"inf_nan_count": 0
},
{
"step": 1300,
"loss": 10.146,
"learning_rate": 8.13e-06,
"inf_nan_count": 0
},
{
"step": 1325,
"loss": 10.0944,
"learning_rate": 8.28e-06,
"inf_nan_count": 0
},
{
"step": 1350,
"loss": 10.0885,
"learning_rate": 8.44e-06,
"inf_nan_count": 0
},
{
"step": 1375,
"loss": 10.0748,
"learning_rate": 8.59e-06,
"inf_nan_count": 0
},
{
"step": 1400,
"loss": 10.0425,
"learning_rate": 8.75e-06,
"inf_nan_count": 0
},
{
"step": 1425,
"loss": 10.0422,
"learning_rate": 8.91e-06,
"inf_nan_count": 0
},
{
"step": 1450,
"loss": 10.0039,
"learning_rate": 9.06e-06,
"inf_nan_count": 0
},
{
"step": 1475,
"loss": 9.9736,
"learning_rate": 9.22e-06,
"inf_nan_count": 0
},
{
"step": 1500,
"loss": 9.9729,
"learning_rate": 9.38e-06,
"inf_nan_count": 0
},
{
"step": 1525,
"loss": 9.9379,
"learning_rate": 9.53e-06,
"inf_nan_count": 0
},
{
"step": 1550,
"loss": 9.8819,
"learning_rate": 9.69e-06,
"inf_nan_count": 0
},
{
"step": 1575,
"loss": 9.8702,
"learning_rate": 9.84e-06,
"inf_nan_count": 0
},
{
"step": 1600,
"loss": 9.8571,
"learning_rate": 1e-05,
"inf_nan_count": 0
},
{
"step": 1625,
"loss": 9.8356,
"learning_rate": 1.02e-05,
"inf_nan_count": 0
},
{
"step": 1650,
"loss": 9.7973,
"learning_rate": 1.03e-05,
"inf_nan_count": 0
},
{
"step": 1675,
"loss": 9.7745,
"learning_rate": 1.05e-05,
"inf_nan_count": 0
},
{
"step": 1700,
"loss": 9.7673,
"learning_rate": 1.06e-05,
"inf_nan_count": 0
},
{
"step": 1725,
"loss": 9.7406,
"learning_rate": 1.08e-05,
"inf_nan_count": 0
},
{
"step": 1750,
"loss": 9.7312,
"learning_rate": 1.09e-05,
"inf_nan_count": 0
},
{
"step": 1775,
"loss": 9.6563,
"learning_rate": 1.11e-05,
"inf_nan_count": 0
},
{
"step": 1800,
"loss": 9.6515,
"learning_rate": 1.13e-05,
"inf_nan_count": 0
},
{
"step": 1825,
"loss": 9.6241,
"learning_rate": 1.14e-05,
"inf_nan_count": 0
},
{
"step": 1850,
"loss": 9.6015,
"learning_rate": 1.16e-05,
"inf_nan_count": 0
},
{
"step": 1875,
"loss": 9.5933,
"learning_rate": 1.17e-05,
"inf_nan_count": 0
},
{
"step": 1900,
"loss": 9.5544,
"learning_rate": 1.19e-05,
"inf_nan_count": 0
},
{
"step": 1925,
"loss": 9.5407,
"learning_rate": 1.2e-05,
"inf_nan_count": 0
},
{
"step": 1950,
"loss": 9.5431,
"learning_rate": 1.22e-05,
"inf_nan_count": 0
},
{
"step": 1975,
"loss": 9.4853,
"learning_rate": 1.23e-05,
"inf_nan_count": 0
},
{
"step": 2000,
"loss": 9.4665,
"learning_rate": 1.25e-05,
"inf_nan_count": 0
},
{
"step": 2025,
"loss": 9.4621,
"learning_rate": 1.27e-05,
"inf_nan_count": 0
},
{
"step": 2050,
"loss": 9.4031,
"learning_rate": 1.28e-05,
"inf_nan_count": 0
},
{
"step": 2075,
"loss": 9.3699,
"learning_rate": 1.3e-05,
"inf_nan_count": 0
},
{
"step": 2100,
"loss": 9.3422,
"learning_rate": 1.31e-05,
"inf_nan_count": 0
},
{
"step": 2125,
"loss": 9.3129,
"learning_rate": 1.33e-05,
"inf_nan_count": 0
},
{
"step": 2150,
"loss": 9.2917,
"learning_rate": 1.34e-05,
"inf_nan_count": 0
},
{
"step": 2175,
"loss": 9.267,
"learning_rate": 1.36e-05,
"inf_nan_count": 0
},
{
"step": 2200,
"loss": 9.2512,
"learning_rate": 1.38e-05,
"inf_nan_count": 0
},
{
"step": 2225,
"loss": 9.2737,
"learning_rate": 1.39e-05,
"inf_nan_count": 0
},
{
"step": 2250,
"loss": 9.2357,
"learning_rate": 1.41e-05,
"inf_nan_count": 0
},
{
"step": 2275,
"loss": 9.1471,
"learning_rate": 1.42e-05,
"inf_nan_count": 0
},
{
"step": 2300,
"loss": 9.1305,
"learning_rate": 1.44e-05,
"inf_nan_count": 0
},
{
"step": 2325,
"loss": 9.143,
"learning_rate": 1.45e-05,
"inf_nan_count": 0
},
{
"step": 2350,
"loss": 9.0948,
"learning_rate": 1.47e-05,
"inf_nan_count": 0
},
{
"step": 2375,
"loss": 9.0256,
"learning_rate": 1.48e-05,
"inf_nan_count": 0
},
{
"step": 2400,
"loss": 9.0664,
"learning_rate": 1.5e-05,
"inf_nan_count": 0
},
{
"step": 2425,
"loss": 9.002,
"learning_rate": 1.52e-05,
"inf_nan_count": 0
},
{
"step": 2450,
"loss": 8.9518,
"learning_rate": 1.53e-05,
"inf_nan_count": 0
},
{
"step": 2475,
"loss": 8.9717,
"learning_rate": 1.55e-05,
"inf_nan_count": 0
},
{
"step": 2500,
"loss": 8.9536,
"learning_rate": 1.56e-05,
"inf_nan_count": 0
},
{
"step": 2525,
"loss": 8.8812,
"learning_rate": 1.58e-05,
"inf_nan_count": 0
},
{
"step": 2550,
"loss": 8.8824,
"learning_rate": 1.59e-05,
"inf_nan_count": 0
},
{
"step": 2575,
"loss": 8.8564,
"learning_rate": 1.61e-05,
"inf_nan_count": 0
},
{
"step": 2600,
"loss": 8.8419,
"learning_rate": 1.63e-05,
"inf_nan_count": 0
},
{
"step": 2625,
"loss": 8.7865,
"learning_rate": 1.64e-05,
"inf_nan_count": 0
},
{
"step": 2650,
"loss": 8.7493,
"learning_rate": 1.66e-05,
"inf_nan_count": 0
},
{
"step": 2675,
"loss": 8.7255,
"learning_rate": 1.67e-05,
"inf_nan_count": 0
},
{
"step": 2700,
"loss": 8.6469,
"learning_rate": 1.69e-05,
"inf_nan_count": 0
},
{
"step": 2725,
"loss": 8.6799,
"learning_rate": 1.7e-05,
"inf_nan_count": 0
},
{
"step": 2750,
"loss": 8.6974,
"learning_rate": 1.72e-05,
"inf_nan_count": 0
},
{
"step": 2775,
"loss": 8.6441,
"learning_rate": 1.73e-05,
"inf_nan_count": 0
},
{
"step": 2800,
"loss": 8.6689,
"learning_rate": 1.75e-05,
"inf_nan_count": 0
},
{
"step": 2825,
"loss": 8.5732,
"learning_rate": 1.77e-05,
"inf_nan_count": 0
},
{
"step": 2850,
"loss": 8.5955,
"learning_rate": 1.78e-05,
"inf_nan_count": 0
},
{
"step": 2875,
"loss": 8.5823,
"learning_rate": 1.8e-05,
"inf_nan_count": 0
},
{
"step": 2900,
"loss": 8.5968,
"learning_rate": 1.81e-05,
"inf_nan_count": 0
},
{
"step": 2925,
"loss": 8.4721,
"learning_rate": 1.83e-05,
"inf_nan_count": 0
},
{
"step": 2950,
"loss": 8.4672,
"learning_rate": 1.84e-05,
"inf_nan_count": 0
},
{
"step": 2975,
"loss": 8.4033,
"learning_rate": 1.86e-05,
"inf_nan_count": 0
},
{
"step": 3000,
"loss": 8.4947,
"learning_rate": 1.88e-05,
"inf_nan_count": 0
},
{
"step": 3025,
"loss": 8.378,
"learning_rate": 1.89e-05,
"inf_nan_count": 0
},
{
"step": 3050,
"loss": 8.3581,
"learning_rate": 1.91e-05,
"inf_nan_count": 0
},
{
"step": 3075,
"loss": 8.3341,
"learning_rate": 1.92e-05,
"inf_nan_count": 0
},
{
"step": 3100,
"loss": 8.3391,
"learning_rate": 1.94e-05,
"inf_nan_count": 0
},
{
"step": 3125,
"loss": 8.367,
"learning_rate": 1.95e-05,
"inf_nan_count": 0
},
{
"step": 3150,
"loss": 8.237,
"learning_rate": 1.97e-05,
"inf_nan_count": 0
},
{
"step": 3175,
"loss": 8.2879,
"learning_rate": 1.98e-05,
"inf_nan_count": 0
},
{
"step": 3200,
"loss": 8.2706,
"learning_rate": 2e-05,
"inf_nan_count": 0
},
{
"step": 3225,
"loss": 8.1983,
"learning_rate": 2.02e-05,
"inf_nan_count": 0
},
{
"step": 3250,
"loss": 8.2174,
"learning_rate": 2.03e-05,
"inf_nan_count": 0
},
{
"step": 3275,
"loss": 8.2229,
"learning_rate": 2.05e-05,
"inf_nan_count": 0
},
{
"step": 3300,
"loss": 8.1398,
"learning_rate": 2.06e-05,
"inf_nan_count": 0
},
{
"step": 3325,
"loss": 8.143,
"learning_rate": 2.08e-05,
"inf_nan_count": 0
},
{
"step": 3350,
"loss": 8.1471,
"learning_rate": 2.09e-05,
"inf_nan_count": 0
},
{
"step": 3375,
"loss": 8.0908,
"learning_rate": 2.11e-05,
"inf_nan_count": 0
},
{
"step": 3400,
"loss": 8.1165,
"learning_rate": 2.13e-05,
"inf_nan_count": 0
},
{
"step": 3425,
"loss": 8.0957,
"learning_rate": 2.14e-05,
"inf_nan_count": 0
},
{
"step": 3450,
"loss": 8.1115,
"learning_rate": 2.16e-05,
"inf_nan_count": 0
},
{
"step": 3475,
"loss": 8.0623,
"learning_rate": 2.17e-05,
"inf_nan_count": 0
},
{
"step": 3500,
"loss": 8.0527,
"learning_rate": 2.19e-05,
"inf_nan_count": 0
},
{
"step": 3525,
"loss": 7.9975,
"learning_rate": 2.2e-05,
"inf_nan_count": 0
},
{
"step": 3550,
"loss": 7.9881,
"learning_rate": 2.22e-05,
"inf_nan_count": 0
},
{
"step": 3575,
"loss": 8.006,
"learning_rate": 2.23e-05,
"inf_nan_count": 0
},
{
"step": 3600,
"loss": 7.9366,
"learning_rate": 2.25e-05,
"inf_nan_count": 0
},
{
"step": 3625,
"loss": 8.0252,
"learning_rate": 2.27e-05,
"inf_nan_count": 0
},
{
"step": 3650,
"loss": 7.916,
"learning_rate": 2.28e-05,
"inf_nan_count": 0
},
{
"step": 3675,
"loss": 7.947,
"learning_rate": 2.3e-05,
"inf_nan_count": 0
},
{
"step": 3700,
"loss": 7.8943,
"learning_rate": 2.31e-05,
"inf_nan_count": 0
},
{
"step": 3725,
"loss": 7.8951,
"learning_rate": 2.33e-05,
"inf_nan_count": 0
},
{
"step": 3750,
"loss": 7.9316,
"learning_rate": 2.34e-05,
"inf_nan_count": 0
},
{
"step": 3775,
"loss": 7.9407,
"learning_rate": 2.36e-05,
"inf_nan_count": 0
},
{
"step": 3800,
"loss": 7.9385,
"learning_rate": 2.38e-05,
"inf_nan_count": 0
},
{
"step": 3825,
"loss": 7.88,
"learning_rate": 2.39e-05,
"inf_nan_count": 0
},
{
"step": 3850,
"loss": 7.9207,
"learning_rate": 2.41e-05,
"inf_nan_count": 0
},
{
"step": 3875,
"loss": 7.8258,
"learning_rate": 2.42e-05,
"inf_nan_count": 0
},
{
"step": 3900,
"loss": 7.9005,
"learning_rate": 2.44e-05,
"inf_nan_count": 0
},
{
"step": 3925,
"loss": 7.8232,
"learning_rate": 2.45e-05,
"inf_nan_count": 0
},
{
"step": 3950,
"loss": 7.7847,
"learning_rate": 2.47e-05,
"inf_nan_count": 0
},
{
"step": 3975,
"loss": 7.7909,
"learning_rate": 2.48e-05,
"inf_nan_count": 0
},
{
"step": 4000,
"loss": 7.7419,
"learning_rate": 2.5e-05,
"inf_nan_count": 0
},
{
"step": 4025,
"loss": 7.8031,
"learning_rate": 2.52e-05,
"inf_nan_count": 0
},
{
"step": 4050,
"loss": 7.7948,
"learning_rate": 2.53e-05,
"inf_nan_count": 0
},
{
"step": 4075,
"loss": 7.7259,
"learning_rate": 2.55e-05,
"inf_nan_count": 0
},
{
"step": 4100,
"loss": 7.8406,
"learning_rate": 2.56e-05,
"inf_nan_count": 0
},
{
"step": 4125,
"loss": 7.7938,
"learning_rate": 2.58e-05,
"inf_nan_count": 0
},
{
"step": 4150,
"loss": 7.7101,
"learning_rate": 2.59e-05,
"inf_nan_count": 0
},
{
"step": 4175,
"loss": 7.6633,
"learning_rate": 2.61e-05,
"inf_nan_count": 0
},
{
"step": 4200,
"loss": 7.683,
"learning_rate": 2.63e-05,
"inf_nan_count": 0
},
{
"step": 4225,
"loss": 7.7106,
"learning_rate": 2.64e-05,
"inf_nan_count": 0
},
{
"step": 4250,
"loss": 7.7174,
"learning_rate": 2.66e-05,
"inf_nan_count": 0
},
{
"step": 4275,
"loss": 7.7508,
"learning_rate": 2.67e-05,
"inf_nan_count": 0
},
{
"step": 4300,
"loss": 7.6831,
"learning_rate": 2.69e-05,
"inf_nan_count": 0
},
{
"step": 4325,
"loss": 7.6498,
"learning_rate": 2.7e-05,
"inf_nan_count": 0
},
{
"step": 4350,
"loss": 7.6668,
"learning_rate": 2.72e-05,
"inf_nan_count": 0
},
{
"step": 4375,
"loss": 7.6852,
"learning_rate": 2.73e-05,
"inf_nan_count": 0
},
{
"step": 4400,
"loss": 7.6469,
"learning_rate": 2.75e-05,
"inf_nan_count": 0
},
{
"step": 4425,
"loss": 7.7448,
"learning_rate": 2.77e-05,
"inf_nan_count": 0
},
{
"step": 4450,
"loss": 7.7422,
"learning_rate": 2.78e-05,
"inf_nan_count": 0
},
{
"step": 4475,
"loss": 7.6918,
"learning_rate": 2.8e-05,
"inf_nan_count": 0
},
{
"step": 4500,
"loss": 7.7084,
"learning_rate": 2.81e-05,
"inf_nan_count": 0
},
{
"step": 4525,
"loss": 7.722,
"learning_rate": 2.83e-05,
"inf_nan_count": 0
},
{
"step": 4550,
"loss": 7.6893,
"learning_rate": 2.84e-05,
"inf_nan_count": 0
},
{
"step": 4575,
"loss": 7.6454,
"learning_rate": 2.86e-05,
"inf_nan_count": 0
},
{
"step": 4600,
"loss": 7.6298,
"learning_rate": 2.87e-05,
"inf_nan_count": 0
},
{
"step": 4625,
"loss": 7.642,
"learning_rate": 2.89e-05,
"inf_nan_count": 0
},
{
"step": 4650,
"loss": 7.6247,
"learning_rate": 2.91e-05,
"inf_nan_count": 0
},
{
"step": 4675,
"loss": 7.6448,
"learning_rate": 2.92e-05,
"inf_nan_count": 0
},
{
"step": 4700,
"loss": 7.6506,
"learning_rate": 2.94e-05,
"inf_nan_count": 0
},
{
"step": 4725,
"loss": 7.6356,
"learning_rate": 2.95e-05,
"inf_nan_count": 0
},
{
"step": 4750,
"loss": 7.6426,
"learning_rate": 2.97e-05,
"inf_nan_count": 0
},
{
"step": 4775,
"loss": 7.6388,
"learning_rate": 2.98e-05,
"inf_nan_count": 0
},
{
"step": 4800,
"loss": 7.5216,
"learning_rate": 3e-05,
"inf_nan_count": 0
},
{
"step": 4825,
"loss": 7.5367,
"learning_rate": 3.02e-05,
"inf_nan_count": 0
},
{
"step": 4850,
"loss": 7.5084,
"learning_rate": 3.03e-05,
"inf_nan_count": 0
},
{
"step": 4875,
"loss": 7.6092,
"learning_rate": 3.05e-05,
"inf_nan_count": 0
},
{
"step": 4900,
"loss": 7.576,
"learning_rate": 3.06e-05,
"inf_nan_count": 0
},
{
"step": 4925,
"loss": 7.5686,
"learning_rate": 3.08e-05,
"inf_nan_count": 0
},
{
"step": 4950,
"loss": 7.5583,
"learning_rate": 3.09e-05,
"inf_nan_count": 0
},
{
"step": 4975,
"loss": 7.5818,
"learning_rate": 3.11e-05,
"inf_nan_count": 0
},
{
"step": 5000,
"loss": 7.6004,
"learning_rate": 3.13e-05,
"inf_nan_count": 0
},
{
"step": 5025,
"loss": 7.5371,
"learning_rate": 3.14e-05,
"inf_nan_count": 0
},
{
"step": 5050,
"loss": 7.5179,
"learning_rate": 3.16e-05,
"inf_nan_count": 0
},
{
"step": 5075,
"loss": 7.5255,
"learning_rate": 3.17e-05,
"inf_nan_count": 0
},
{
"step": 5100,
"loss": 7.5155,
"learning_rate": 3.19e-05,
"inf_nan_count": 0
},
{
"step": 5125,
"loss": 7.566,
"learning_rate": 3.2e-05,
"inf_nan_count": 0
},
{
"step": 5150,
"loss": 7.4797,
"learning_rate": 3.22e-05,
"inf_nan_count": 0
},
{
"step": 5175,
"loss": 7.6224,
"learning_rate": 3.23e-05,
"inf_nan_count": 0
},
{
"step": 5200,
"loss": 7.4821,
"learning_rate": 3.25e-05,
"inf_nan_count": 0
},
{
"step": 5225,
"loss": 7.4765,
"learning_rate": 3.27e-05,
"inf_nan_count": 0
},
{
"step": 5250,
"loss": 7.468,
"learning_rate": 3.28e-05,
"inf_nan_count": 0
},
{
"step": 5275,
"loss": 7.5165,
"learning_rate": 3.3e-05,
"inf_nan_count": 0
},
{
"step": 5300,
"loss": 7.5334,
"learning_rate": 3.31e-05,
"inf_nan_count": 0
},
{
"step": 5325,
"loss": 7.5053,
"learning_rate": 3.33e-05,
"inf_nan_count": 0
},
{
"step": 5350,
"loss": 7.5115,
"learning_rate": 3.34e-05,
"inf_nan_count": 0
},
{
"step": 5375,
"loss": 7.4736,
"learning_rate": 3.36e-05,
"inf_nan_count": 0
},
{
"step": 5400,
"loss": 7.452,
"learning_rate": 3.38e-05,
"inf_nan_count": 0
},
{
"step": 5425,
"loss": 7.4596,
"learning_rate": 3.39e-05,
"inf_nan_count": 0
},
{
"step": 5450,
"loss": 7.4518,
"learning_rate": 3.41e-05,
"inf_nan_count": 0
},
{
"step": 5475,
"loss": 7.4308,
"learning_rate": 3.42e-05,
"inf_nan_count": 0
},
{
"step": 5500,
"loss": 7.4627,
"learning_rate": 3.44e-05,
"inf_nan_count": 0
},
{
"step": 5525,
"loss": 7.4095,
"learning_rate": 3.45e-05,
"inf_nan_count": 0
},
{
"step": 5550,
"loss": 7.4423,
"learning_rate": 3.47e-05,
"inf_nan_count": 0
},
{
"step": 5575,
"loss": 7.46,
"learning_rate": 3.48e-05,
"inf_nan_count": 0
},
{
"step": 5600,
"loss": 7.3457,
"learning_rate": 3.5e-05,
"inf_nan_count": 0
},
{
"step": 5625,
"loss": 7.4838,
"learning_rate": 3.52e-05,
"inf_nan_count": 0
},
{
"step": 5650,
"loss": 7.4556,
"learning_rate": 3.53e-05,
"inf_nan_count": 0
},
{
"step": 5675,
"loss": 7.422,
"learning_rate": 3.55e-05,
"inf_nan_count": 0
},
{
"step": 5700,
"loss": 7.4307,
"learning_rate": 3.56e-05,
"inf_nan_count": 0
},
{
"step": 5725,
"loss": 7.3795,
"learning_rate": 3.58e-05,
"inf_nan_count": 0
},
{
"step": 5750,
"loss": 7.3855,
"learning_rate": 3.59e-05,
"inf_nan_count": 0
},
{
"step": 5775,
"loss": 7.3518,
"learning_rate": 3.61e-05,
"inf_nan_count": 0
},
{
"step": 5800,
"loss": 7.3794,
"learning_rate": 3.63e-05,
"inf_nan_count": 0
},
{
"step": 5825,
"loss": 7.3591,
"learning_rate": 3.64e-05,
"inf_nan_count": 0
},
{
"step": 5850,
"loss": 7.3489,
"learning_rate": 3.66e-05,
"inf_nan_count": 0
},
{
"step": 5875,
"loss": 7.4108,
"learning_rate": 3.67e-05,
"inf_nan_count": 0
},
{
"step": 5900,
"loss": 7.358,
"learning_rate": 3.69e-05,
"inf_nan_count": 0
},
{
"step": 5925,
"loss": 7.3131,
"learning_rate": 3.7e-05,
"inf_nan_count": 0
},
{
"step": 5950,
"loss": 7.2905,
"learning_rate": 3.72e-05,
"inf_nan_count": 0
},
{
"step": 5975,
"loss": 7.3466,
"learning_rate": 3.73e-05,
"inf_nan_count": 0
},
{
"step": 6000,
"loss": 7.3765,
"learning_rate": 3.75e-05,
"inf_nan_count": 0
},
{
"step": 6025,
"loss": 7.287,
"learning_rate": 3.77e-05,
"inf_nan_count": 0
},
{
"step": 6050,
"loss": 7.3333,
"learning_rate": 3.78e-05,
"inf_nan_count": 0
},
{
"step": 6075,
"loss": 7.3098,
"learning_rate": 3.8e-05,
"inf_nan_count": 0
},
{
"step": 6100,
"loss": 7.2594,
"learning_rate": 3.81e-05,
"inf_nan_count": 0
},
{
"step": 6125,
"loss": 7.3327,
"learning_rate": 3.83e-05,
"inf_nan_count": 0
},
{
"step": 6150,
"loss": 7.303,
"learning_rate": 3.84e-05,
"inf_nan_count": 0
},
{
"step": 6175,
"loss": 7.2523,
"learning_rate": 3.86e-05,
"inf_nan_count": 0
},
{
"step": 6200,
"loss": 7.2546,
"learning_rate": 3.87e-05,
"inf_nan_count": 0
},
{
"step": 6225,
"loss": 7.3242,
"learning_rate": 3.89e-05,
"inf_nan_count": 0
},
{
"step": 6250,
"loss": 7.2035,
"learning_rate": 3.91e-05,
"inf_nan_count": 0
},
{
"step": 6275,
"loss": 7.2334,
"learning_rate": 3.92e-05,
"inf_nan_count": 0
},
{
"step": 6300,
"loss": 7.2295,
"learning_rate": 3.94e-05,
"inf_nan_count": 0
},
{
"step": 6325,
"loss": 7.3051,
"learning_rate": 3.95e-05,
"inf_nan_count": 0
},
{
"step": 6350,
"loss": 7.3188,
"learning_rate": 3.97e-05,
"inf_nan_count": 0
},
{
"step": 6375,
"loss": 7.3212,
"learning_rate": 3.98e-05,
"inf_nan_count": 0
},
{
"step": 6400,
"loss": 7.2465,
"learning_rate": 4e-05,
"inf_nan_count": 0
},
{
"step": 6425,
"loss": 7.2081,
"learning_rate": 4.02e-05,
"inf_nan_count": 0
},
{
"step": 6450,
"loss": 7.2852,
"learning_rate": 4.03e-05,
"inf_nan_count": 0
},
{
"step": 6475,
"loss": 7.2074,
"learning_rate": 4.05e-05,
"inf_nan_count": 0
},
{
"step": 6500,
"loss": 7.252,
"learning_rate": 4.06e-05,
"inf_nan_count": 0
},
{
"step": 6525,
"loss": 7.2115,
"learning_rate": 4.08e-05,
"inf_nan_count": 0
},
{
"step": 6550,
"loss": 7.2435,
"learning_rate": 4.09e-05,
"inf_nan_count": 0
},
{
"step": 6575,
"loss": 7.1962,
"learning_rate": 4.11e-05,
"inf_nan_count": 0
},
{
"step": 6600,
"loss": 7.1631,
"learning_rate": 4.12e-05,
"inf_nan_count": 0
},
{
"step": 6625,
"loss": 7.2525,
"learning_rate": 4.14e-05,
"inf_nan_count": 0
},
{
"step": 6650,
"loss": 7.2133,
"learning_rate": 4.16e-05,
"inf_nan_count": 0
},
{
"step": 6675,
"loss": 7.2248,
"learning_rate": 4.17e-05,
"inf_nan_count": 0
},
{
"step": 6700,
"loss": 7.1928,
"learning_rate": 4.19e-05,
"inf_nan_count": 0
},
{
"step": 6725,
"loss": 7.1698,
"learning_rate": 4.2e-05,
"inf_nan_count": 0
},
{
"step": 6750,
"loss": 7.3037,
"learning_rate": 4.22e-05,
"inf_nan_count": 0
},
{
"step": 6775,
"loss": 7.2451,
"learning_rate": 4.23e-05,
"inf_nan_count": 0
},
{
"step": 6800,
"loss": 7.1373,
"learning_rate": 4.25e-05,
"inf_nan_count": 0
},
{
"step": 6825,
"loss": 7.139,
"learning_rate": 4.27e-05,
"inf_nan_count": 0
},
{
"step": 6850,
"loss": 7.1296,
"learning_rate": 4.28e-05,
"inf_nan_count": 0
},
{
"step": 6875,
"loss": 7.0961,
"learning_rate": 4.3e-05,
"inf_nan_count": 0
},
{
"step": 6900,
"loss": 7.1408,
"learning_rate": 4.31e-05,
"inf_nan_count": 0
},
{
"step": 6925,
"loss": 7.1852,
"learning_rate": 4.33e-05,
"inf_nan_count": 0
},
{
"step": 6950,
"loss": 7.2067,
"learning_rate": 4.34e-05,
"inf_nan_count": 0
},
{
"step": 6975,
"loss": 7.0681,
"learning_rate": 4.36e-05,
"inf_nan_count": 0
},
{
"step": 7000,
"loss": 7.1813,
"learning_rate": 4.37e-05,
"inf_nan_count": 0
},
{
"step": 7025,
"loss": 7.1992,
"learning_rate": 4.39e-05,
"inf_nan_count": 0
},
{
"step": 7050,
"loss": 7.1409,
"learning_rate": 4.41e-05,
"inf_nan_count": 0
},
{
"step": 7075,
"loss": 7.1271,
"learning_rate": 4.42e-05,
"inf_nan_count": 0
},
{
"step": 7100,
"loss": 7.172,
"learning_rate": 4.44e-05,
"inf_nan_count": 0
},
{
"step": 7125,
"loss": 7.1515,
"learning_rate": 4.45e-05,
"inf_nan_count": 0
},
{
"step": 7150,
"loss": 7.0898,
"learning_rate": 4.47e-05,
"inf_nan_count": 0
},
{
"step": 7175,
"loss": 7.0996,
"learning_rate": 4.48e-05,
"inf_nan_count": 0
},
{
"step": 7200,
"loss": 7.061,
"learning_rate": 4.5e-05,
"inf_nan_count": 0
},
{
"step": 7225,
"loss": 7.1939,
"learning_rate": 4.52e-05,
"inf_nan_count": 0
},
{
"step": 7250,
"loss": 7.0355,
"learning_rate": 4.53e-05,
"inf_nan_count": 0
},
{
"step": 7275,
"loss": 7.0935,
"learning_rate": 4.55e-05,
"inf_nan_count": 0
},
{
"step": 7300,
"loss": 7.0689,
"learning_rate": 4.56e-05,
"inf_nan_count": 0
},
{
"step": 7325,
"loss": 7.0265,
"learning_rate": 4.58e-05,
"inf_nan_count": 0
},
{
"step": 7350,
"loss": 7.0963,
"learning_rate": 4.59e-05,
"inf_nan_count": 0
},
{
"step": 7375,
"loss": 7.1138,
"learning_rate": 4.61e-05,
"inf_nan_count": 0
},
{
"step": 7400,
"loss": 7.0414,
"learning_rate": 4.63e-05,
"inf_nan_count": 0
},
{
"step": 7425,
"loss": 7.0753,
"learning_rate": 4.64e-05,
"inf_nan_count": 0
},
{
"step": 7450,
"loss": 7.0603,
"learning_rate": 4.66e-05,
"inf_nan_count": 0
},
{
"step": 7475,
"loss": 7.0818,
"learning_rate": 4.67e-05,
"inf_nan_count": 0
},
{
"step": 7500,
"loss": 7.0788,
"learning_rate": 4.69e-05,
"inf_nan_count": 0
},
{
"step": 7525,
"loss": 6.9952,
"learning_rate": 4.7e-05,
"inf_nan_count": 0
},
{
"step": 7550,
"loss": 7.0114,
"learning_rate": 4.72e-05,
"inf_nan_count": 0
},
{
"step": 7575,
"loss": 7.0611,
"learning_rate": 4.73e-05,
"inf_nan_count": 0
},
{
"step": 7600,
"loss": 7.0057,
"learning_rate": 4.75e-05,
"inf_nan_count": 0
},
{
"step": 7625,
"loss": 7.0182,
"learning_rate": 4.77e-05,
"inf_nan_count": 0
},
{
"step": 7650,
"loss": 7.0271,
"learning_rate": 4.78e-05,
"inf_nan_count": 0
},
{
"step": 7675,
"loss": 7.0817,
"learning_rate": 4.8e-05,
"inf_nan_count": 0
},
{
"step": 7700,
"loss": 7.0859,
"learning_rate": 4.81e-05,
"inf_nan_count": 0
},
{
"step": 7725,
"loss": 6.9859,
"learning_rate": 4.83e-05,
"inf_nan_count": 0
},
{
"step": 7750,
"loss": 7.038,
"learning_rate": 4.84e-05,
"inf_nan_count": 0
},
{
"step": 7775,
"loss": 6.9784,
"learning_rate": 4.86e-05,
"inf_nan_count": 0
},
{
"step": 7800,
"loss": 7.0304,
"learning_rate": 4.87e-05,
"inf_nan_count": 0
},
{
"step": 7825,
"loss": 7.0,
"learning_rate": 4.89e-05,
"inf_nan_count": 0
},
{
"step": 7850,
"loss": 7.0159,
"learning_rate": 4.91e-05,
"inf_nan_count": 0
},
{
"step": 7875,
"loss": 6.9859,
"learning_rate": 4.92e-05,
"inf_nan_count": 0
},
{
"step": 7900,
"loss": 6.9348,
"learning_rate": 4.94e-05,
"inf_nan_count": 0
},
{
"step": 7925,
"loss": 6.9541,
"learning_rate": 4.95e-05,
"inf_nan_count": 0
},
{
"step": 7950,
"loss": 6.9342,
"learning_rate": 4.97e-05,
"inf_nan_count": 0
},
{
"step": 7975,
"loss": 7.0294,
"learning_rate": 4.98e-05,
"inf_nan_count": 0
},
{
"step": 8000,
"loss": 7.0412,
"learning_rate": 5e-05,
"inf_nan_count": 0
},
{
"step": 8025,
"loss": 6.9111,
"learning_rate": 4.99e-05,
"inf_nan_count": 0
},
{
"step": 8050,
"loss": 7.0142,
"learning_rate": 4.98e-05,
"inf_nan_count": 0
},
{
"step": 8075,
"loss": 6.9201,
"learning_rate": 4.97e-05,
"inf_nan_count": 0
},
{
"step": 8100,
"loss": 6.91,
"learning_rate": 4.96e-05,
"inf_nan_count": 0
},
{
"step": 8125,
"loss": 6.9728,
"learning_rate": 4.95e-05,
"inf_nan_count": 0
},
{
"step": 8150,
"loss": 6.9963,
"learning_rate": 4.94e-05,
"inf_nan_count": 0
},
{
"step": 8175,
"loss": 7.0077,
"learning_rate": 4.93e-05,
"inf_nan_count": 0
},
{
"step": 8200,
"loss": 6.8808,
"learning_rate": 4.92e-05,
"inf_nan_count": 0
},
{
"step": 8225,
"loss": 6.85,
"learning_rate": 4.91e-05,
"inf_nan_count": 0
},
{
"step": 8250,
"loss": 6.9328,
"learning_rate": 4.9e-05,
"inf_nan_count": 0
},
{
"step": 8275,
"loss": 6.8971,
"learning_rate": 4.89e-05,
"inf_nan_count": 0
},
{
"step": 8300,
"loss": 6.9635,
"learning_rate": 4.87e-05,
"inf_nan_count": 0
},
{
"step": 8325,
"loss": 6.8937,
"learning_rate": 4.86e-05,
"inf_nan_count": 0
},
{
"step": 8350,
"loss": 6.8578,
"learning_rate": 4.85e-05,
"inf_nan_count": 0
},
{
"step": 8375,
"loss": 6.9492,
"learning_rate": 4.84e-05,
"inf_nan_count": 0
},
{
"step": 8400,
"loss": 6.8896,
"learning_rate": 4.83e-05,
"inf_nan_count": 0
},
{
"step": 8425,
"loss": 6.9677,
"learning_rate": 4.82e-05,
"inf_nan_count": 0
},
{
"step": 8450,
"loss": 6.9071,
"learning_rate": 4.81e-05,
"inf_nan_count": 0
},
{
"step": 8475,
"loss": 6.8973,
"learning_rate": 4.8e-05,
"inf_nan_count": 0
},
{
"step": 8500,
"loss": 6.9139,
"learning_rate": 4.79e-05,
"inf_nan_count": 0
},
{
"step": 8525,
"loss": 6.8983,
"learning_rate": 4.78e-05,
"inf_nan_count": 0
},
{
"step": 8550,
"loss": 6.8446,
"learning_rate": 4.77e-05,
"inf_nan_count": 0
},
{
"step": 8575,
"loss": 6.8246,
"learning_rate": 4.76e-05,
"inf_nan_count": 0
},
{
"step": 8600,
"loss": 6.9637,
"learning_rate": 4.75e-05,
"inf_nan_count": 0
},
{
"step": 8625,
"loss": 6.8827,
"learning_rate": 4.74e-05,
"inf_nan_count": 0
},
{
"step": 8650,
"loss": 6.8234,
"learning_rate": 4.73e-05,
"inf_nan_count": 0
},
{
"step": 8675,
"loss": 6.827,
"learning_rate": 4.72e-05,
"inf_nan_count": 0
},
{
"step": 8700,
"loss": 6.9554,
"learning_rate": 4.71e-05,
"inf_nan_count": 0
},
{
"step": 8725,
"loss": 6.8406,
"learning_rate": 4.7e-05,
"inf_nan_count": 0
},
{
"step": 8750,
"loss": 6.8328,
"learning_rate": 4.69e-05,
"inf_nan_count": 0
},
{
"step": 8775,
"loss": 6.8362,
"learning_rate": 4.68e-05,
"inf_nan_count": 0
},
{
"step": 8800,
"loss": 6.8417,
"learning_rate": 4.67e-05,
"inf_nan_count": 0
},
{
"step": 8825,
"loss": 6.8248,
"learning_rate": 4.66e-05,
"inf_nan_count": 0
},
{
"step": 8850,
"loss": 6.7996,
"learning_rate": 4.65e-05,
"inf_nan_count": 0
},
{
"step": 8875,
"loss": 6.7804,
"learning_rate": 4.64e-05,
"inf_nan_count": 0
},
{
"step": 8900,
"loss": 6.8802,
"learning_rate": 4.63e-05,
"inf_nan_count": 0
},
{
"step": 8925,
"loss": 6.8586,
"learning_rate": 4.61e-05,
"inf_nan_count": 0
},
{
"step": 8950,
"loss": 6.8489,
"learning_rate": 4.6e-05,
"inf_nan_count": 0
},
{
"step": 8975,
"loss": 6.8592,
"learning_rate": 4.59e-05,
"inf_nan_count": 0
},
{
"step": 9000,
"loss": 6.8302,
"learning_rate": 4.58e-05,
"inf_nan_count": 0
},
{
"step": 9025,
"loss": 6.831,
"learning_rate": 4.57e-05,
"inf_nan_count": 0
},
{
"step": 9050,
"loss": 6.7991,
"learning_rate": 4.56e-05,
"inf_nan_count": 0
},
{
"step": 9075,
"loss": 6.8311,
"learning_rate": 4.55e-05,
"inf_nan_count": 0
},
{
"step": 9100,
"loss": 6.7647,
"learning_rate": 4.54e-05,
"inf_nan_count": 0
},
{
"step": 9125,
"loss": 6.8225,
"learning_rate": 4.53e-05,
"inf_nan_count": 0
},
{
"step": 9150,
"loss": 6.7571,
"learning_rate": 4.52e-05,
"inf_nan_count": 0
},
{
"step": 9175,
"loss": 6.806,
"learning_rate": 4.51e-05,
"inf_nan_count": 0
},
{
"step": 9200,
"loss": 6.8348,
"learning_rate": 4.5e-05,
"inf_nan_count": 0
},
{
"step": 9225,
"loss": 6.9131,
"learning_rate": 4.49e-05,
"inf_nan_count": 0
},
{
"step": 9250,
"loss": 6.7801,
"learning_rate": 4.48e-05,
"inf_nan_count": 0
},
{
"step": 9275,
"loss": 6.7776,
"learning_rate": 4.47e-05,
"inf_nan_count": 0
},
{
"step": 9300,
"loss": 6.716,
"learning_rate": 4.46e-05,
"inf_nan_count": 0
},
{
"step": 9325,
"loss": 6.8958,
"learning_rate": 4.45e-05,
"inf_nan_count": 0
},
{
"step": 9350,
"loss": 6.8734,
"learning_rate": 4.44e-05,
"inf_nan_count": 0
},
{
"step": 9375,
"loss": 6.7203,
"learning_rate": 4.43e-05,
"inf_nan_count": 0
},
{
"step": 9400,
"loss": 6.7133,
"learning_rate": 4.42e-05,
"inf_nan_count": 0
},
{
"step": 9425,
"loss": 6.8392,
"learning_rate": 4.41e-05,
"inf_nan_count": 0
},
{
"step": 9450,
"loss": 6.7945,
"learning_rate": 4.4e-05,
"inf_nan_count": 0
},
{
"step": 9475,
"loss": 6.7831,
"learning_rate": 4.39e-05,
"inf_nan_count": 0
},
{
"step": 9500,
"loss": 6.7336,
"learning_rate": 4.37e-05,
"inf_nan_count": 0
},
{
"step": 9525,
"loss": 6.7529,
"learning_rate": 4.36e-05,
"inf_nan_count": 0
},
{
"step": 9550,
"loss": 6.6838,
"learning_rate": 4.35e-05,
"inf_nan_count": 0
},
{
"step": 9575,
"loss": 6.7548,
"learning_rate": 4.34e-05,
"inf_nan_count": 0
},
{
"step": 9600,
"loss": 6.8837,
"learning_rate": 4.33e-05,
"inf_nan_count": 0
},
{
"step": 9625,
"loss": 6.8271,
"learning_rate": 4.32e-05,
"inf_nan_count": 0
},
{
"step": 9650,
"loss": 6.7446,
"learning_rate": 4.31e-05,
"inf_nan_count": 0
},
{
"step": 9675,
"loss": 6.6811,
"learning_rate": 4.3e-05,
"inf_nan_count": 0
},
{
"step": 9700,
"loss": 6.7641,
"learning_rate": 4.29e-05,
"inf_nan_count": 0
},
{
"step": 9725,
"loss": 6.6779,
"learning_rate": 4.28e-05,
"inf_nan_count": 0
},
{
"step": 9750,
"loss": 6.7428,
"learning_rate": 4.27e-05,
"inf_nan_count": 0
},
{
"step": 9775,
"loss": 6.7698,
"learning_rate": 4.26e-05,
"inf_nan_count": 0
},
{
"step": 9800,
"loss": 6.7282,
"learning_rate": 4.25e-05,
"inf_nan_count": 0
},
{
"step": 9825,
"loss": 6.7314,
"learning_rate": 4.24e-05,
"inf_nan_count": 0
},
{
"step": 9850,
"loss": 6.7281,
"learning_rate": 4.23e-05,
"inf_nan_count": 0
},
{
"step": 9875,
"loss": 6.8553,
"learning_rate": 4.22e-05,
"inf_nan_count": 0
},
{
"step": 9900,
"loss": 6.7912,
"learning_rate": 4.21e-05,
"inf_nan_count": 0
},
{
"step": 9925,
"loss": 6.7301,
"learning_rate": 4.2e-05,
"inf_nan_count": 0
},
{
"step": 9950,
"loss": 6.7467,
"learning_rate": 4.19e-05,
"inf_nan_count": 0
},
{
"step": 9975,
"loss": 6.6581,
"learning_rate": 4.18e-05,
"inf_nan_count": 0
},
{
"step": 10000,
"loss": 6.7114,
"learning_rate": 4.17e-05,
"inf_nan_count": 0
},
{
"step": 10025,
"loss": 6.7754,
"learning_rate": 4.16e-05,
"inf_nan_count": 0
},
{
"step": 10050,
"loss": 6.695,
"learning_rate": 4.15e-05,
"inf_nan_count": 0
},
{
"step": 10075,
"loss": 6.6791,
"learning_rate": 4.14e-05,
"inf_nan_count": 0
},
{
"step": 10100,
"loss": 6.6957,
"learning_rate": 4.12e-05,
"inf_nan_count": 0
},
{
"step": 10125,
"loss": 6.7073,
"learning_rate": 4.11e-05,
"inf_nan_count": 0
},
{
"step": 10150,
"loss": 6.774,
"learning_rate": 4.1e-05,
"inf_nan_count": 0
},
{
"step": 10175,
"loss": 6.8045,
"learning_rate": 4.09e-05,
"inf_nan_count": 0
},
{
"step": 10200,
"loss": 6.761,
"learning_rate": 4.08e-05,
"inf_nan_count": 0
},
{
"step": 10225,
"loss": 6.6995,
"learning_rate": 4.07e-05,
"inf_nan_count": 0
},
{
"step": 10250,
"loss": 6.6779,
"learning_rate": 4.06e-05,
"inf_nan_count": 0
},
{
"step": 10275,
"loss": 6.7462,
"learning_rate": 4.05e-05,
"inf_nan_count": 0
},
{
"step": 10300,
"loss": 6.7099,
"learning_rate": 4.04e-05,
"inf_nan_count": 0
},
{
"step": 10325,
"loss": 6.7013,
"learning_rate": 4.03e-05,
"inf_nan_count": 0
},
{
"step": 10350,
"loss": 6.7173,
"learning_rate": 4.02e-05,
"inf_nan_count": 0
},
{
"step": 10375,
"loss": 6.6967,
"learning_rate": 4.01e-05,
"inf_nan_count": 0
},
{
"step": 10400,
"loss": 6.7565,
"learning_rate": 4e-05,
"inf_nan_count": 0
},
{
"step": 10425,
"loss": 6.7468,
"learning_rate": 3.99e-05,
"inf_nan_count": 0
},
{
"step": 10450,
"loss": 6.7132,
"learning_rate": 3.98e-05,
"inf_nan_count": 0
},
{
"step": 10475,
"loss": 6.6358,
"learning_rate": 3.97e-05,
"inf_nan_count": 0
},
{
"step": 10500,
"loss": 6.6979,
"learning_rate": 3.96e-05,
"inf_nan_count": 0
},
{
"step": 10525,
"loss": 6.6512,
"learning_rate": 3.95e-05,
"inf_nan_count": 0
},
{
"step": 10550,
"loss": 6.6045,
"learning_rate": 3.94e-05,
"inf_nan_count": 0
},
{
"step": 10575,
"loss": 6.6217,
"learning_rate": 3.93e-05,
"inf_nan_count": 0
},
{
"step": 10600,
"loss": 6.7091,
"learning_rate": 3.92e-05,
"inf_nan_count": 0
},
{
"step": 10625,
"loss": 6.618,
"learning_rate": 3.91e-05,
"inf_nan_count": 0
},
{
"step": 10650,
"loss": 6.6743,
"learning_rate": 3.9e-05,
"inf_nan_count": 0
},
{
"step": 10675,
"loss": 6.6481,
"learning_rate": 3.89e-05,
"inf_nan_count": 0
},
{
"step": 10700,
"loss": 6.6888,
"learning_rate": 3.87e-05,
"inf_nan_count": 0
},
{
"step": 10725,
"loss": 6.5786,
"learning_rate": 3.86e-05,
"inf_nan_count": 0
},
{
"step": 10750,
"loss": 6.6917,
"learning_rate": 3.85e-05,
"inf_nan_count": 0
},
{
"step": 10775,
"loss": 6.6487,
"learning_rate": 3.84e-05,
"inf_nan_count": 0
},
{
"step": 10800,
"loss": 6.7293,
"learning_rate": 3.83e-05,
"inf_nan_count": 0
},
{
"step": 10825,
"loss": 6.6369,
"learning_rate": 3.82e-05,
"inf_nan_count": 0
},
{
"step": 10850,
"loss": 6.7118,
"learning_rate": 3.81e-05,
"inf_nan_count": 0
},
{
"step": 10875,
"loss": 6.7235,
"learning_rate": 3.8e-05,
"inf_nan_count": 0
},
{
"step": 10900,
"loss": 6.6963,
"learning_rate": 3.79e-05,
"inf_nan_count": 0
},
{
"step": 10925,
"loss": 6.6791,
"learning_rate": 3.78e-05,
"inf_nan_count": 0
},
{
"step": 10950,
"loss": 6.6773,
"learning_rate": 3.77e-05,
"inf_nan_count": 0
},
{
"step": 10975,
"loss": 6.6819,
"learning_rate": 3.76e-05,
"inf_nan_count": 0
},
{
"step": 11000,
"loss": 6.6167,
"learning_rate": 3.75e-05,
"inf_nan_count": 0
},
{
"step": 11025,
"loss": 6.6727,
"learning_rate": 3.74e-05,
"inf_nan_count": 0
},
{
"step": 11050,
"loss": 6.6317,
"learning_rate": 3.73e-05,
"inf_nan_count": 0
},
{
"step": 11075,
"loss": 6.6432,
"learning_rate": 3.72e-05,
"inf_nan_count": 0
},
{
"step": 11100,
"loss": 6.6468,
"learning_rate": 3.71e-05,
"inf_nan_count": 0
},
{
"step": 11125,
"loss": 6.646,
"learning_rate": 3.7e-05,
"inf_nan_count": 0
},
{
"step": 11150,
"loss": 6.6852,
"learning_rate": 3.69e-05,
"inf_nan_count": 0
},
{
"step": 11175,
"loss": 6.5716,
"learning_rate": 3.68e-05,
"inf_nan_count": 0
},
{
"step": 11200,
"loss": 6.6311,
"learning_rate": 3.67e-05,
"inf_nan_count": 0
},
{
"step": 11225,
"loss": 6.648,
"learning_rate": 3.66e-05,
"inf_nan_count": 0
},
{
"step": 11250,
"loss": 6.6204,
"learning_rate": 3.65e-05,
"inf_nan_count": 0
},
{
"step": 11275,
"loss": 6.6551,
"learning_rate": 3.64e-05,
"inf_nan_count": 0
},
{
"step": 11300,
"loss": 6.6013,
"learning_rate": 3.63e-05,
"inf_nan_count": 0
},
{
"step": 11325,
"loss": 6.6478,
"learning_rate": 3.61e-05,
"inf_nan_count": 0
},
{
"step": 11350,
"loss": 6.6938,
"learning_rate": 3.6e-05,
"inf_nan_count": 0
},
{
"step": 11375,
"loss": 6.6124,
"learning_rate": 3.59e-05,
"inf_nan_count": 0
},
{
"step": 11400,
"loss": 6.6781,
"learning_rate": 3.58e-05,
"inf_nan_count": 0
},
{
"step": 11425,
"loss": 6.6317,
"learning_rate": 3.57e-05,
"inf_nan_count": 0
},
{
"step": 11450,
"loss": 6.6195,
"learning_rate": 3.56e-05,
"inf_nan_count": 0
},
{
"step": 11475,
"loss": 6.5941,
"learning_rate": 3.55e-05,
"inf_nan_count": 0
},
{
"step": 11500,
"loss": 6.5808,
"learning_rate": 3.54e-05,
"inf_nan_count": 0
},
{
"step": 11525,
"loss": 6.6322,
"learning_rate": 3.53e-05,
"inf_nan_count": 0
},
{
"step": 11550,
"loss": 6.6172,
"learning_rate": 3.52e-05,
"inf_nan_count": 0
},
{
"step": 11575,
"loss": 6.649,
"learning_rate": 3.51e-05,
"inf_nan_count": 0
},
{
"step": 11600,
"loss": 6.605,
"learning_rate": 3.5e-05,
"inf_nan_count": 0
},
{
"step": 11625,
"loss": 6.6184,
"learning_rate": 3.49e-05,
"inf_nan_count": 0
},
{
"step": 11650,
"loss": 6.5597,
"learning_rate": 3.48e-05,
"inf_nan_count": 0
},
{
"step": 11675,
"loss": 6.6285,
"learning_rate": 3.47e-05,
"inf_nan_count": 0
},
{
"step": 11700,
"loss": 6.5209,
"learning_rate": 3.46e-05,
"inf_nan_count": 0
},
{
"step": 11725,
"loss": 6.5505,
"learning_rate": 3.45e-05,
"inf_nan_count": 0
},
{
"step": 11750,
"loss": 6.671,
"learning_rate": 3.44e-05,
"inf_nan_count": 0
},
{
"step": 11775,
"loss": 6.6403,
"learning_rate": 3.43e-05,
"inf_nan_count": 0
},
{
"step": 11800,
"loss": 6.5738,
"learning_rate": 3.42e-05,
"inf_nan_count": 0
},
{
"step": 11825,
"loss": 6.608,
"learning_rate": 3.41e-05,
"inf_nan_count": 0
},
{
"step": 11850,
"loss": 6.6406,
"learning_rate": 3.4e-05,
"inf_nan_count": 0
},
{
"step": 11875,
"loss": 6.6299,
"learning_rate": 3.39e-05,
"inf_nan_count": 0
},
{
"step": 11900,
"loss": 6.5781,
"learning_rate": 3.38e-05,
"inf_nan_count": 0
},
{
"step": 11925,
"loss": 6.5003,
"learning_rate": 3.36e-05,
"inf_nan_count": 0
},
{
"step": 11950,
"loss": 6.635,
"learning_rate": 3.35e-05,
"inf_nan_count": 0
},
{
"step": 11975,
"loss": 6.618,
"learning_rate": 3.34e-05,
"inf_nan_count": 0
},
{
"step": 12000,
"loss": 6.6603,
"learning_rate": 3.33e-05,
"inf_nan_count": 0
},
{
"step": 12025,
"loss": 6.5507,
"learning_rate": 3.32e-05,
"inf_nan_count": 0
},
{
"step": 12050,
"loss": 6.5878,
"learning_rate": 3.31e-05,
"inf_nan_count": 0
},
{
"step": 12075,
"loss": 6.5245,
"learning_rate": 3.3e-05,
"inf_nan_count": 0
},
{
"step": 12100,
"loss": 6.5629,
"learning_rate": 3.29e-05,
"inf_nan_count": 0
},
{
"step": 12125,
"loss": 6.6181,
"learning_rate": 3.28e-05,
"inf_nan_count": 0
},
{
"step": 12150,
"loss": 6.578,
"learning_rate": 3.27e-05,
"inf_nan_count": 0
},
{
"step": 12175,
"loss": 6.5753,
"learning_rate": 3.26e-05,
"inf_nan_count": 0
},
{
"step": 12200,
"loss": 6.6071,
"learning_rate": 3.25e-05,
"inf_nan_count": 0
},
{
"step": 12225,
"loss": 6.5885,
"learning_rate": 3.24e-05,
"inf_nan_count": 0
},
{
"step": 12250,
"loss": 6.5413,
"learning_rate": 3.23e-05,
"inf_nan_count": 0
},
{
"step": 12275,
"loss": 6.6635,
"learning_rate": 3.22e-05,
"inf_nan_count": 0
},
{
"step": 12300,
"loss": 6.6304,
"learning_rate": 3.21e-05,
"inf_nan_count": 0
},
{
"step": 12325,
"loss": 6.5078,
"learning_rate": 3.2e-05,
"inf_nan_count": 0
},
{
"step": 12350,
"loss": 6.5712,
"learning_rate": 3.19e-05,
"inf_nan_count": 0
},
{
"step": 12375,
"loss": 6.6284,
"learning_rate": 3.18e-05,
"inf_nan_count": 0
},
{
"step": 12400,
"loss": 6.5837,
"learning_rate": 3.17e-05,
"inf_nan_count": 0
},
{
"step": 12425,
"loss": 6.5354,
"learning_rate": 3.16e-05,
"inf_nan_count": 0
},
{
"step": 12450,
"loss": 6.6125,
"learning_rate": 3.15e-05,
"inf_nan_count": 0
},
{
"step": 12475,
"loss": 6.5477,
"learning_rate": 3.14e-05,
"inf_nan_count": 0
},
{
"step": 12500,
"loss": 6.5827,
"learning_rate": 3.13e-05,
"inf_nan_count": 0
},
{
"step": 12525,
"loss": 6.5874,
"learning_rate": 3.11e-05,
"inf_nan_count": 0
},
{
"step": 12550,
"loss": 6.5437,
"learning_rate": 3.1e-05,
"inf_nan_count": 0
},
{
"step": 12575,
"loss": 6.582,
"learning_rate": 3.09e-05,
"inf_nan_count": 0
},
{
"step": 12600,
"loss": 6.5286,
"learning_rate": 3.08e-05,
"inf_nan_count": 0
},
{
"step": 12625,
"loss": 6.5144,
"learning_rate": 3.07e-05,
"inf_nan_count": 0
},
{
"step": 12650,
"loss": 6.5327,
"learning_rate": 3.06e-05,
"inf_nan_count": 0
},
{
"step": 12675,
"loss": 6.6058,
"learning_rate": 3.05e-05,
"inf_nan_count": 0
},
{
"step": 12700,
"loss": 6.5626,
"learning_rate": 3.04e-05,
"inf_nan_count": 0
},
{
"step": 12725,
"loss": 6.4589,
"learning_rate": 3.03e-05,
"inf_nan_count": 0
},
{
"step": 12750,
"loss": 6.5629,
"learning_rate": 3.02e-05,
"inf_nan_count": 0
},
{
"step": 12775,
"loss": 6.4815,
"learning_rate": 3.01e-05,
"inf_nan_count": 0
},
{
"step": 12800,
"loss": 6.5651,
"learning_rate": 3e-05,
"inf_nan_count": 0
},
{
"step": 12825,
"loss": 6.6164,
"learning_rate": 2.99e-05,
"inf_nan_count": 0
},
{
"step": 12850,
"loss": 6.6102,
"learning_rate": 2.98e-05,
"inf_nan_count": 0
},
{
"step": 12875,
"loss": 6.4871,
"learning_rate": 2.97e-05,
"inf_nan_count": 0
},
{
"step": 12900,
"loss": 6.49,
"learning_rate": 2.96e-05,
"inf_nan_count": 0
},
{
"step": 12925,
"loss": 6.6028,
"learning_rate": 2.95e-05,
"inf_nan_count": 0
},
{
"step": 12950,
"loss": 6.5509,
"learning_rate": 2.94e-05,
"inf_nan_count": 0
},
{
"step": 12975,
"loss": 6.5454,
"learning_rate": 2.93e-05,
"inf_nan_count": 0
},
{
"step": 13000,
"loss": 6.5587,
"learning_rate": 2.92e-05,
"inf_nan_count": 0
},
{
"step": 13025,
"loss": 6.5862,
"learning_rate": 2.91e-05,
"inf_nan_count": 0
},
{
"step": 13050,
"loss": 6.5668,
"learning_rate": 2.9e-05,
"inf_nan_count": 0
},
{
"step": 13075,
"loss": 6.522,
"learning_rate": 2.89e-05,
"inf_nan_count": 0
},
{
"step": 13100,
"loss": 6.5044,
"learning_rate": 2.87e-05,
"inf_nan_count": 0
},
{
"step": 13125,
"loss": 6.6356,
"learning_rate": 2.86e-05,
"inf_nan_count": 0
},
{
"step": 13150,
"loss": 6.4772,
"learning_rate": 2.85e-05,
"inf_nan_count": 0
},
{
"step": 13175,
"loss": 6.5504,
"learning_rate": 2.84e-05,
"inf_nan_count": 0
},
{
"step": 13200,
"loss": 6.5415,
"learning_rate": 2.83e-05,
"inf_nan_count": 0
},
{
"step": 13225,
"loss": 6.4651,
"learning_rate": 2.82e-05,
"inf_nan_count": 0
},
{
"step": 13250,
"loss": 6.5536,
"learning_rate": 2.81e-05,
"inf_nan_count": 0
},
{
"step": 13275,
"loss": 6.4861,
"learning_rate": 2.8e-05,
"inf_nan_count": 0
},
{
"step": 13300,
"loss": 6.4688,
"learning_rate": 2.79e-05,
"inf_nan_count": 0
},
{
"step": 13325,
"loss": 6.5549,
"learning_rate": 2.78e-05,
"inf_nan_count": 0
},
{
"step": 13350,
"loss": 6.4589,
"learning_rate": 2.77e-05,
"inf_nan_count": 0
},
{
"step": 13375,
"loss": 6.4644,
"learning_rate": 2.76e-05,
"inf_nan_count": 0
},
{
"step": 13400,
"loss": 6.5937,
"learning_rate": 2.75e-05,
"inf_nan_count": 0
},
{
"step": 13425,
"loss": 6.5798,
"learning_rate": 2.74e-05,
"inf_nan_count": 0
},
{
"step": 13450,
"loss": 6.4615,
"learning_rate": 2.73e-05,
"inf_nan_count": 0
},
{
"step": 13475,
"loss": 6.5173,
"learning_rate": 2.72e-05,
"inf_nan_count": 0
},
{
"step": 13500,
"loss": 6.4795,
"learning_rate": 2.71e-05,
"inf_nan_count": 0
},
{
"step": 13525,
"loss": 6.4789,
"learning_rate": 2.7e-05,
"inf_nan_count": 0
},
{
"step": 13550,
"loss": 6.4835,
"learning_rate": 2.69e-05,
"inf_nan_count": 0
},
{
"step": 13575,
"loss": 6.5405,
"learning_rate": 2.68e-05,
"inf_nan_count": 0
},
{
"step": 13600,
"loss": 6.4616,
"learning_rate": 2.67e-05,
"inf_nan_count": 0
},
{
"step": 13625,
"loss": 6.4578,
"learning_rate": 2.66e-05,
"inf_nan_count": 0
},
{
"step": 13650,
"loss": 6.4083,
"learning_rate": 2.65e-05,
"inf_nan_count": 0
},
{
"step": 13675,
"loss": 6.561,
"learning_rate": 2.64e-05,
"inf_nan_count": 0
},
{
"step": 13700,
"loss": 6.5432,
"learning_rate": 2.63e-05,
"inf_nan_count": 0
},
{
"step": 13725,
"loss": 6.5119,
"learning_rate": 2.61e-05,
"inf_nan_count": 0
},
{
"step": 13750,
"loss": 6.454,
"learning_rate": 2.6e-05,
"inf_nan_count": 0
},
{
"step": 13775,
"loss": 6.44,
"learning_rate": 2.59e-05,
"inf_nan_count": 0
},
{
"step": 13800,
"loss": 6.4767,
"learning_rate": 2.58e-05,
"inf_nan_count": 0
},
{
"step": 13825,
"loss": 6.4765,
"learning_rate": 2.57e-05,
"inf_nan_count": 0
},
{
"step": 13850,
"loss": 6.5018,
"learning_rate": 2.56e-05,
"inf_nan_count": 0
},
{
"step": 13875,
"loss": 6.5011,
"learning_rate": 2.55e-05,
"inf_nan_count": 0
},
{
"step": 13900,
"loss": 6.4283,
"learning_rate": 2.54e-05,
"inf_nan_count": 0
},
{
"step": 13925,
"loss": 6.519,
"learning_rate": 2.53e-05,
"inf_nan_count": 0
},
{
"step": 13950,
"loss": 6.4388,
"learning_rate": 2.52e-05,
"inf_nan_count": 0
},
{
"step": 13975,
"loss": 6.455,
"learning_rate": 2.51e-05,
"inf_nan_count": 0
},
{
"step": 14000,
"loss": 6.3491,
"learning_rate": 2.5e-05,
"inf_nan_count": 0
},
{
"step": 14025,
"loss": 6.5285,
"learning_rate": 2.49e-05,
"inf_nan_count": 0
},
{
"step": 14050,
"loss": 6.5082,
"learning_rate": 2.48e-05,
"inf_nan_count": 0
},
{
"step": 14075,
"loss": 6.5451,
"learning_rate": 2.47e-05,
"inf_nan_count": 0
},
{
"step": 14100,
"loss": 6.4753,
"learning_rate": 2.46e-05,
"inf_nan_count": 0
},
{
"step": 14125,
"loss": 6.6011,
"learning_rate": 2.45e-05,
"inf_nan_count": 0
},
{
"step": 14150,
"loss": 6.4885,
"learning_rate": 2.44e-05,
"inf_nan_count": 0
},
{
"step": 14175,
"loss": 6.4635,
"learning_rate": 2.43e-05,
"inf_nan_count": 0
},
{
"step": 14200,
"loss": 6.5519,
"learning_rate": 2.42e-05,
"inf_nan_count": 0
},
{
"step": 14225,
"loss": 6.4356,
"learning_rate": 2.41e-05,
"inf_nan_count": 0
},
{
"step": 14250,
"loss": 6.4552,
"learning_rate": 2.4e-05,
"inf_nan_count": 0
},
{
"step": 14275,
"loss": 6.4613,
"learning_rate": 2.39e-05,
"inf_nan_count": 0
},
{
"step": 14300,
"loss": 6.4411,
"learning_rate": 2.38e-05,
"inf_nan_count": 0
},
{
"step": 14325,
"loss": 6.557,
"learning_rate": 2.36e-05,
"inf_nan_count": 0
},
{
"step": 14350,
"loss": 6.4476,
"learning_rate": 2.35e-05,
"inf_nan_count": 0
},
{
"step": 14375,
"loss": 6.5895,
"learning_rate": 2.34e-05,
"inf_nan_count": 0
},
{
"step": 14400,
"loss": 6.4836,
"learning_rate": 2.33e-05,
"inf_nan_count": 0
},
{
"step": 14425,
"loss": 6.4175,
"learning_rate": 2.32e-05,
"inf_nan_count": 0
},
{
"step": 14450,
"loss": 6.4971,
"learning_rate": 2.31e-05,
"inf_nan_count": 0
},
{
"step": 14475,
"loss": 6.4897,
"learning_rate": 2.3e-05,
"inf_nan_count": 0
},
{
"step": 14500,
"loss": 6.455,
"learning_rate": 2.29e-05,
"inf_nan_count": 0
},
{
"step": 14525,
"loss": 6.4688,
"learning_rate": 2.28e-05,
"inf_nan_count": 0
},
{
"step": 14550,
"loss": 6.5494,
"learning_rate": 2.27e-05,
"inf_nan_count": 0
},
{
"step": 14575,
"loss": 6.4501,
"learning_rate": 2.26e-05,
"inf_nan_count": 0
},
{
"step": 14600,
"loss": 6.5142,
"learning_rate": 2.25e-05,
"inf_nan_count": 0
},
{
"step": 14625,
"loss": 6.4891,
"learning_rate": 2.24e-05,
"inf_nan_count": 0
},
{
"step": 14650,
"loss": 6.4274,
"learning_rate": 2.23e-05,
"inf_nan_count": 0
},
{
"step": 14675,
"loss": 6.5277,
"learning_rate": 2.22e-05,
"inf_nan_count": 0
},
{
"step": 14700,
"loss": 6.4472,
"learning_rate": 2.21e-05,
"inf_nan_count": 0
},
{
"step": 14725,
"loss": 6.4328,
"learning_rate": 2.2e-05,
"inf_nan_count": 0
},
{
"step": 14750,
"loss": 6.4928,
"learning_rate": 2.19e-05,
"inf_nan_count": 0
},
{
"step": 14775,
"loss": 6.552,
"learning_rate": 2.18e-05,
"inf_nan_count": 0
},
{
"step": 14800,
"loss": 6.5474,
"learning_rate": 2.17e-05,
"inf_nan_count": 0
},
{
"step": 14825,
"loss": 6.4394,
"learning_rate": 2.16e-05,
"inf_nan_count": 0
},
{
"step": 14850,
"loss": 6.5234,
"learning_rate": 2.15e-05,
"inf_nan_count": 0
},
{
"step": 14875,
"loss": 6.4369,
"learning_rate": 2.14e-05,
"inf_nan_count": 0
},
{
"step": 14900,
"loss": 6.4694,
"learning_rate": 2.13e-05,
"inf_nan_count": 0
},
{
"step": 14925,
"loss": 6.5837,
"learning_rate": 2.11e-05,
"inf_nan_count": 0
},
{
"step": 14950,
"loss": 6.4841,
"learning_rate": 2.1e-05,
"inf_nan_count": 0
},
{
"step": 14975,
"loss": 6.4347,
"learning_rate": 2.09e-05,
"inf_nan_count": 0
},
{
"step": 15000,
"loss": 6.5816,
"learning_rate": 2.08e-05,
"inf_nan_count": 0
},
{
"step": 15025,
"loss": 6.5337,
"learning_rate": 2.07e-05,
"inf_nan_count": 0
},
{
"step": 15050,
"loss": 6.5131,
"learning_rate": 2.06e-05,
"inf_nan_count": 0
},
{
"step": 15075,
"loss": 6.4669,
"learning_rate": 2.05e-05,
"inf_nan_count": 0
},
{
"step": 15100,
"loss": 6.5141,
"learning_rate": 2.04e-05,
"inf_nan_count": 0
},
{
"step": 15125,
"loss": 6.438,
"learning_rate": 2.03e-05,
"inf_nan_count": 0
},
{
"step": 15150,
"loss": 6.4036,
"learning_rate": 2.02e-05,
"inf_nan_count": 0
},
{
"step": 15175,
"loss": 6.4517,
"learning_rate": 2.01e-05,
"inf_nan_count": 0
},
{
"step": 15200,
"loss": 6.477,
"learning_rate": 2e-05,
"inf_nan_count": 0
},
{
"step": 15225,
"loss": 6.4317,
"learning_rate": 1.99e-05,
"inf_nan_count": 0
},
{
"step": 15250,
"loss": 6.488,
"learning_rate": 1.98e-05,
"inf_nan_count": 0
},
{
"step": 15275,
"loss": 6.4466,
"learning_rate": 1.97e-05,
"inf_nan_count": 0
},
{
"step": 15300,
"loss": 6.4248,
"learning_rate": 1.96e-05,
"inf_nan_count": 0
},
{
"step": 15325,
"loss": 6.3834,
"learning_rate": 1.95e-05,
"inf_nan_count": 0
},
{
"step": 15350,
"loss": 6.4272,
"learning_rate": 1.94e-05,
"inf_nan_count": 0
},
{
"step": 15375,
"loss": 6.4834,
"learning_rate": 1.93e-05,
"inf_nan_count": 0
},
{
"step": 15400,
"loss": 6.405,
"learning_rate": 1.92e-05,
"inf_nan_count": 0
},
{
"step": 15425,
"loss": 6.4264,
"learning_rate": 1.91e-05,
"inf_nan_count": 0
},
{
"step": 15450,
"loss": 6.4941,
"learning_rate": 1.9e-05,
"inf_nan_count": 0
},
{
"step": 15475,
"loss": 6.4755,
"learning_rate": 1.89e-05,
"inf_nan_count": 0
},
{
"step": 15500,
"loss": 6.5459,
"learning_rate": 1.88e-05,
"inf_nan_count": 0
},
{
"step": 15525,
"loss": 6.3772,
"learning_rate": 1.86e-05,
"inf_nan_count": 0
},
{
"step": 15550,
"loss": 6.443,
"learning_rate": 1.85e-05,
"inf_nan_count": 0
},
{
"step": 15575,
"loss": 6.3931,
"learning_rate": 1.84e-05,
"inf_nan_count": 0
},
{
"step": 15600,
"loss": 6.4087,
"learning_rate": 1.83e-05,
"inf_nan_count": 0
},
{
"step": 15625,
"loss": 6.4743,
"learning_rate": 1.82e-05,
"inf_nan_count": 0
},
{
"step": 15650,
"loss": 6.4575,
"learning_rate": 1.81e-05,
"inf_nan_count": 0
},
{
"step": 15675,
"loss": 6.4971,
"learning_rate": 1.8e-05,
"inf_nan_count": 0
},
{
"step": 15700,
"loss": 6.438,
"learning_rate": 1.79e-05,
"inf_nan_count": 0
},
{
"step": 15725,
"loss": 6.5071,
"learning_rate": 1.78e-05,
"inf_nan_count": 0
},
{
"step": 15750,
"loss": 6.391,
"learning_rate": 1.77e-05,
"inf_nan_count": 0
},
{
"step": 15775,
"loss": 6.4386,
"learning_rate": 1.76e-05,
"inf_nan_count": 0
},
{
"step": 15800,
"loss": 6.4268,
"learning_rate": 1.75e-05,
"inf_nan_count": 0
},
{
"step": 15825,
"loss": 6.5534,
"learning_rate": 1.74e-05,
"inf_nan_count": 0
},
{
"step": 15850,
"loss": 6.4422,
"learning_rate": 1.73e-05,
"inf_nan_count": 0
},
{
"step": 15875,
"loss": 6.4075,
"learning_rate": 1.72e-05,
"inf_nan_count": 0
},
{
"step": 15900,
"loss": 6.4458,
"learning_rate": 1.71e-05,
"inf_nan_count": 0
},
{
"step": 15925,
"loss": 6.3855,
"learning_rate": 1.7e-05,
"inf_nan_count": 0
},
{
"step": 15950,
"loss": 6.3659,
"learning_rate": 1.69e-05,
"inf_nan_count": 0
},
{
"step": 15975,
"loss": 6.5396,
"learning_rate": 1.68e-05,
"inf_nan_count": 0
},
{
"step": 16000,
"loss": 6.4974,
"learning_rate": 1.67e-05,
"inf_nan_count": 0
},
{
"step": 16025,
"loss": 6.4785,
"learning_rate": 1.66e-05,
"inf_nan_count": 0
},
{
"step": 16050,
"loss": 6.4341,
"learning_rate": 1.65e-05,
"inf_nan_count": 0
},
{
"step": 16075,
"loss": 6.3709,
"learning_rate": 1.64e-05,
"inf_nan_count": 0
},
{
"step": 16100,
"loss": 6.3707,
"learning_rate": 1.63e-05,
"inf_nan_count": 0
},
{
"step": 16125,
"loss": 6.4206,
"learning_rate": 1.61e-05,
"inf_nan_count": 0
},
{
"step": 16150,
"loss": 6.397,
"learning_rate": 1.6e-05,
"inf_nan_count": 0
},
{
"step": 16175,
"loss": 6.4617,
"learning_rate": 1.59e-05,
"inf_nan_count": 0
},
{
"step": 16200,
"loss": 6.5586,
"learning_rate": 1.58e-05,
"inf_nan_count": 0
},
{
"step": 16225,
"loss": 6.4248,
"learning_rate": 1.57e-05,
"inf_nan_count": 0
},
{
"step": 16250,
"loss": 6.4204,
"learning_rate": 1.56e-05,
"inf_nan_count": 0
},
{
"step": 16275,
"loss": 6.4632,
"learning_rate": 1.55e-05,
"inf_nan_count": 0
},
{
"step": 16300,
"loss": 6.4491,
"learning_rate": 1.54e-05,
"inf_nan_count": 0
},
{
"step": 16325,
"loss": 6.4412,
"learning_rate": 1.53e-05,
"inf_nan_count": 0
},
{
"step": 16350,
"loss": 6.4144,
"learning_rate": 1.52e-05,
"inf_nan_count": 0
},
{
"step": 16375,
"loss": 6.466,
"learning_rate": 1.51e-05,
"inf_nan_count": 0
},
{
"step": 16400,
"loss": 6.4246,
"learning_rate": 1.5e-05,
"inf_nan_count": 0
},
{
"step": 16425,
"loss": 6.4571,
"learning_rate": 1.49e-05,
"inf_nan_count": 0
},
{
"step": 16450,
"loss": 6.3903,
"learning_rate": 1.48e-05,
"inf_nan_count": 0
},
{
"step": 16475,
"loss": 6.4141,
"learning_rate": 1.47e-05,
"inf_nan_count": 0
},
{
"step": 16500,
"loss": 6.4467,
"learning_rate": 1.46e-05,
"inf_nan_count": 0
},
{
"step": 16525,
"loss": 6.356,
"learning_rate": 1.45e-05,
"inf_nan_count": 0
},
{
"step": 16550,
"loss": 6.4049,
"learning_rate": 1.44e-05,
"inf_nan_count": 0
},
{
"step": 16575,
"loss": 6.4103,
"learning_rate": 1.43e-05,
"inf_nan_count": 0
},
{
"step": 16600,
"loss": 6.4282,
"learning_rate": 1.42e-05,
"inf_nan_count": 0
},
{
"step": 16625,
"loss": 6.5397,
"learning_rate": 1.41e-05,
"inf_nan_count": 0
},
{
"step": 16650,
"loss": 6.3862,
"learning_rate": 1.4e-05,
"inf_nan_count": 0
},
{
"step": 16675,
"loss": 6.4291,
"learning_rate": 1.39e-05,
"inf_nan_count": 0
},
{
"step": 16700,
"loss": 6.433,
"learning_rate": 1.38e-05,
"inf_nan_count": 0
},
{
"step": 16725,
"loss": 6.3934,
"learning_rate": 1.36e-05,
"inf_nan_count": 0
},
{
"step": 16750,
"loss": 6.4042,
"learning_rate": 1.35e-05,
"inf_nan_count": 0
},
{
"step": 16775,
"loss": 6.4187,
"learning_rate": 1.34e-05,
"inf_nan_count": 0
},
{
"step": 16800,
"loss": 6.4455,
"learning_rate": 1.33e-05,
"inf_nan_count": 0
},
{
"step": 16825,
"loss": 6.424,
"learning_rate": 1.32e-05,
"inf_nan_count": 0
},
{
"step": 16850,
"loss": 6.4491,
"learning_rate": 1.31e-05,
"inf_nan_count": 0
},
{
"step": 16875,
"loss": 6.3993,
"learning_rate": 1.3e-05,
"inf_nan_count": 0
},
{
"step": 16900,
"loss": 6.4393,
"learning_rate": 1.29e-05,
"inf_nan_count": 0
},
{
"step": 16925,
"loss": 6.3705,
"learning_rate": 1.28e-05,
"inf_nan_count": 0
},
{
"step": 16950,
"loss": 6.4404,
"learning_rate": 1.27e-05,
"inf_nan_count": 0
},
{
"step": 16975,
"loss": 6.4507,
"learning_rate": 1.26e-05,
"inf_nan_count": 0
},
{
"step": 17000,
"loss": 6.3821,
"learning_rate": 1.25e-05,
"inf_nan_count": 0
},
{
"step": 17025,
"loss": 6.4234,
"learning_rate": 1.24e-05,
"inf_nan_count": 0
},
{
"step": 17050,
"loss": 6.4235,
"learning_rate": 1.23e-05,
"inf_nan_count": 0
},
{
"step": 17075,
"loss": 6.4856,
"learning_rate": 1.22e-05,
"inf_nan_count": 0
},
{
"step": 17100,
"loss": 6.4877,
"learning_rate": 1.21e-05,
"inf_nan_count": 0
},
{
"step": 17125,
"loss": 6.3683,
"learning_rate": 1.2e-05,
"inf_nan_count": 0
},
{
"step": 17150,
"loss": 6.4225,
"learning_rate": 1.19e-05,
"inf_nan_count": 0
},
{
"step": 17175,
"loss": 6.2573,
"learning_rate": 1.18e-05,
"inf_nan_count": 0
},
{
"step": 17200,
"loss": 6.3946,
"learning_rate": 1.17e-05,
"inf_nan_count": 0
},
{
"step": 17225,
"loss": 6.4607,
"learning_rate": 1.16e-05,
"inf_nan_count": 0
},
{
"step": 17250,
"loss": 6.4407,
"learning_rate": 1.15e-05,
"inf_nan_count": 0
},
{
"step": 17275,
"loss": 6.4333,
"learning_rate": 1.14e-05,
"inf_nan_count": 0
},
{
"step": 17300,
"loss": 6.3782,
"learning_rate": 1.13e-05,
"inf_nan_count": 0
},
{
"step": 17325,
"loss": 6.3665,
"learning_rate": 1.11e-05,
"inf_nan_count": 0
},
{
"step": 17350,
"loss": 6.4329,
"learning_rate": 1.1e-05,
"inf_nan_count": 0
},
{
"step": 17375,
"loss": 6.5107,
"learning_rate": 1.09e-05,
"inf_nan_count": 0
},
{
"step": 17400,
"loss": 6.5076,
"learning_rate": 1.08e-05,
"inf_nan_count": 0
},
{
"step": 17425,
"loss": 6.4936,
"learning_rate": 1.07e-05,
"inf_nan_count": 0
},
{
"step": 17450,
"loss": 6.4119,
"learning_rate": 1.06e-05,
"inf_nan_count": 0
},
{
"step": 17475,
"loss": 6.4032,
"learning_rate": 1.05e-05,
"inf_nan_count": 0
},
{
"step": 17500,
"loss": 6.3962,
"learning_rate": 1.04e-05,
"inf_nan_count": 0
},
{
"step": 17525,
"loss": 6.4288,
"learning_rate": 1.03e-05,
"inf_nan_count": 0
},
{
"step": 17550,
"loss": 6.4021,
"learning_rate": 1.02e-05,
"inf_nan_count": 0
},
{
"step": 17575,
"loss": 6.367,
"learning_rate": 1.01e-05,
"inf_nan_count": 0
},
{
"step": 17600,
"loss": 6.3904,
"learning_rate": 1e-05,
"inf_nan_count": 0
},
{
"step": 17625,
"loss": 6.5059,
"learning_rate": 9.9e-06,
"inf_nan_count": 0
},
{
"step": 17650,
"loss": 6.4225,
"learning_rate": 9.79e-06,
"inf_nan_count": 0
},
{
"step": 17675,
"loss": 6.4422,
"learning_rate": 9.69e-06,
"inf_nan_count": 0
},
{
"step": 17700,
"loss": 6.457,
"learning_rate": 9.58e-06,
"inf_nan_count": 0
},
{
"step": 17725,
"loss": 6.4475,
"learning_rate": 9.48e-06,
"inf_nan_count": 0
},
{
"step": 17750,
"loss": 6.3786,
"learning_rate": 9.38e-06,
"inf_nan_count": 0
},
{
"step": 17775,
"loss": 6.4145,
"learning_rate": 9.27e-06,
"inf_nan_count": 0
},
{
"step": 17800,
"loss": 6.3543,
"learning_rate": 9.17e-06,
"inf_nan_count": 0
},
{
"step": 17825,
"loss": 6.5116,
"learning_rate": 9.06e-06,
"inf_nan_count": 0
},
{
"step": 17850,
"loss": 6.4101,
"learning_rate": 8.96e-06,
"inf_nan_count": 0
},
{
"step": 17875,
"loss": 6.4014,
"learning_rate": 8.85e-06,
"inf_nan_count": 0
},
{
"step": 17900,
"loss": 6.4216,
"learning_rate": 8.75e-06,
"inf_nan_count": 0
},
{
"step": 17925,
"loss": 6.4539,
"learning_rate": 8.65e-06,
"inf_nan_count": 0
},
{
"step": 17950,
"loss": 6.4205,
"learning_rate": 8.54e-06,
"inf_nan_count": 0
},
{
"step": 17975,
"loss": 6.3865,
"learning_rate": 8.44e-06,
"inf_nan_count": 0
},
{
"step": 18000,
"loss": 6.4347,
"learning_rate": 8.33e-06,
"inf_nan_count": 0
},
{
"step": 18025,
"loss": 6.4313,
"learning_rate": 8.23e-06,
"inf_nan_count": 0
},
{
"step": 18050,
"loss": 6.3868,
"learning_rate": 8.13e-06,
"inf_nan_count": 0
},
{
"step": 18075,
"loss": 6.3703,
"learning_rate": 8.02e-06,
"inf_nan_count": 0
},
{
"step": 18100,
"loss": 6.3747,
"learning_rate": 7.92e-06,
"inf_nan_count": 0
},
{
"step": 18125,
"loss": 6.4228,
"learning_rate": 7.81e-06,
"inf_nan_count": 0
},
{
"step": 18150,
"loss": 6.349,
"learning_rate": 7.71e-06,
"inf_nan_count": 0
},
{
"step": 18175,
"loss": 6.4522,
"learning_rate": 7.6e-06,
"inf_nan_count": 0
},
{
"step": 18200,
"loss": 6.3354,
"learning_rate": 7.5e-06,
"inf_nan_count": 0
},
{
"step": 18225,
"loss": 6.4663,
"learning_rate": 7.4e-06,
"inf_nan_count": 0
},
{
"step": 18250,
"loss": 6.4155,
"learning_rate": 7.29e-06,
"inf_nan_count": 0
},
{
"step": 18275,
"loss": 6.4584,
"learning_rate": 7.19e-06,
"inf_nan_count": 0
},
{
"step": 18300,
"loss": 6.3637,
"learning_rate": 7.08e-06,
"inf_nan_count": 0
},
{
"step": 18325,
"loss": 6.3583,
"learning_rate": 6.98e-06,
"inf_nan_count": 0
},
{
"step": 18350,
"loss": 6.4469,
"learning_rate": 6.88e-06,
"inf_nan_count": 0
},
{
"step": 18375,
"loss": 6.3768,
"learning_rate": 6.77e-06,
"inf_nan_count": 0
},
{
"step": 18400,
"loss": 6.3179,
"learning_rate": 6.67e-06,
"inf_nan_count": 0
},
{
"step": 18425,
"loss": 6.4046,
"learning_rate": 6.56e-06,
"inf_nan_count": 0
},
{
"step": 18450,
"loss": 6.3435,
"learning_rate": 6.46e-06,
"inf_nan_count": 0
},
{
"step": 18475,
"loss": 6.3454,
"learning_rate": 6.35e-06,
"inf_nan_count": 0
},
{
"step": 18500,
"loss": 6.3922,
"learning_rate": 6.25e-06,
"inf_nan_count": 0
},
{
"step": 18525,
"loss": 6.3459,
"learning_rate": 6.15e-06,
"inf_nan_count": 0
},
{
"step": 18550,
"loss": 6.3591,
"learning_rate": 6.04e-06,
"inf_nan_count": 0
},
{
"step": 18575,
"loss": 6.4337,
"learning_rate": 5.94e-06,
"inf_nan_count": 0
},
{
"step": 18600,
"loss": 6.3962,
"learning_rate": 5.83e-06,
"inf_nan_count": 0
},
{
"step": 18625,
"loss": 6.3425,
"learning_rate": 5.73e-06,
"inf_nan_count": 0
},
{
"step": 18650,
"loss": 6.4022,
"learning_rate": 5.63e-06,
"inf_nan_count": 0
},
{
"step": 18675,
"loss": 6.4513,
"learning_rate": 5.52e-06,
"inf_nan_count": 0
},
{
"step": 18700,
"loss": 6.4284,
"learning_rate": 5.42e-06,
"inf_nan_count": 0
},
{
"step": 18725,
"loss": 6.3879,
"learning_rate": 5.31e-06,
"inf_nan_count": 0
},
{
"step": 18750,
"loss": 6.4009,
"learning_rate": 5.21e-06,
"inf_nan_count": 0
},
{
"step": 18775,
"loss": 6.3713,
"learning_rate": 5.1e-06,
"inf_nan_count": 0
},
{
"step": 18800,
"loss": 6.3752,
"learning_rate": 5e-06,
"inf_nan_count": 0
},
{
"step": 18825,
"loss": 6.4265,
"learning_rate": 4.9e-06,
"inf_nan_count": 0
},
{
"step": 18850,
"loss": 6.3709,
"learning_rate": 4.79e-06,
"inf_nan_count": 0
},
{
"step": 18875,
"loss": 6.3316,
"learning_rate": 4.69e-06,
"inf_nan_count": 0
},
{
"step": 18900,
"loss": 6.4479,
"learning_rate": 4.58e-06,
"inf_nan_count": 0
},
{
"step": 18925,
"loss": 6.4247,
"learning_rate": 4.48e-06,
"inf_nan_count": 0
},
{
"step": 18950,
"loss": 6.4126,
"learning_rate": 4.37e-06,
"inf_nan_count": 0
},
{
"step": 18975,
"loss": 6.3489,
"learning_rate": 4.27e-06,
"inf_nan_count": 0
},
{
"step": 19000,
"loss": 6.325,
"learning_rate": 4.17e-06,
"inf_nan_count": 0
},
{
"step": 19025,
"loss": 6.3306,
"learning_rate": 4.06e-06,
"inf_nan_count": 0
},
{
"step": 19050,
"loss": 6.387,
"learning_rate": 3.96e-06,
"inf_nan_count": 0
},
{
"step": 19075,
"loss": 6.4133,
"learning_rate": 3.85e-06,
"inf_nan_count": 0
},
{
"step": 19100,
"loss": 6.334,
"learning_rate": 3.75e-06,
"inf_nan_count": 0
},
{
"step": 19125,
"loss": 6.3034,
"learning_rate": 3.65e-06,
"inf_nan_count": 0
},
{
"step": 19150,
"loss": 6.4097,
"learning_rate": 3.54e-06,
"inf_nan_count": 0
},
{
"step": 19175,
"loss": 6.442,
"learning_rate": 3.44e-06,
"inf_nan_count": 0
},
{
"step": 19200,
"loss": 6.3756,
"learning_rate": 3.33e-06,
"inf_nan_count": 0
},
{
"step": 19225,
"loss": 6.4037,
"learning_rate": 3.23e-06,
"inf_nan_count": 0
},
{
"step": 19250,
"loss": 6.3974,
"learning_rate": 3.13e-06,
"inf_nan_count": 0
},
{
"step": 19275,
"loss": 6.3933,
"learning_rate": 3.02e-06,
"inf_nan_count": 0
},
{
"step": 19300,
"loss": 6.3269,
"learning_rate": 2.92e-06,
"inf_nan_count": 0
},
{
"step": 19325,
"loss": 6.3907,
"learning_rate": 2.81e-06,
"inf_nan_count": 0
},
{
"step": 19350,
"loss": 6.3955,
"learning_rate": 2.71e-06,
"inf_nan_count": 0
},
{
"step": 19375,
"loss": 6.3972,
"learning_rate": 2.6e-06,
"inf_nan_count": 0
},
{
"step": 19400,
"loss": 6.3896,
"learning_rate": 2.5e-06,
"inf_nan_count": 0
},
{
"step": 19425,
"loss": 6.3425,
"learning_rate": 2.4e-06,
"inf_nan_count": 0
},
{
"step": 19450,
"loss": 6.3587,
"learning_rate": 2.29e-06,
"inf_nan_count": 0
},
{
"step": 19475,
"loss": 6.4179,
"learning_rate": 2.19e-06,
"inf_nan_count": 0
},
{
"step": 19500,
"loss": 6.4192,
"learning_rate": 2.08e-06,
"inf_nan_count": 0
},
{
"step": 19525,
"loss": 6.4252,
"learning_rate": 1.98e-06,
"inf_nan_count": 0
},
{
"step": 19550,
"loss": 6.3349,
"learning_rate": 1.88e-06,
"inf_nan_count": 0
},
{
"step": 19575,
"loss": 6.4042,
"learning_rate": 1.77e-06,
"inf_nan_count": 0
},
{
"step": 19600,
"loss": 6.3567,
"learning_rate": 1.67e-06,
"inf_nan_count": 0
},
{
"step": 19625,
"loss": 6.3912,
"learning_rate": 1.56e-06,
"inf_nan_count": 0
},
{
"step": 19650,
"loss": 6.3113,
"learning_rate": 1.46e-06,
"inf_nan_count": 0
},
{
"step": 19675,
"loss": 6.3756,
"learning_rate": 1.35e-06,
"inf_nan_count": 0
},
{
"step": 19700,
"loss": 6.385,
"learning_rate": 1.25e-06,
"inf_nan_count": 0
},
{
"step": 19725,
"loss": 6.3631,
"learning_rate": 1.15e-06,
"inf_nan_count": 0
},
{
"step": 19750,
"loss": 6.4564,
"learning_rate": 1.04e-06,
"inf_nan_count": 0
},
{
"step": 19775,
"loss": 6.3258,
"learning_rate": 9.38e-07,
"inf_nan_count": 0
},
{
"step": 19800,
"loss": 6.4682,
"learning_rate": 8.33e-07,
"inf_nan_count": 0
},
{
"step": 19825,
"loss": 6.4421,
"learning_rate": 7.29e-07,
"inf_nan_count": 0
},
{
"step": 19850,
"loss": 6.4342,
"learning_rate": 6.25e-07,
"inf_nan_count": 0
},
{
"step": 19875,
"loss": 6.4182,
"learning_rate": 5.21e-07,
"inf_nan_count": 0
},
{
"step": 19900,
"loss": 6.3203,
"learning_rate": 4.17e-07,
"inf_nan_count": 0
},
{
"step": 19925,
"loss": 6.4339,
"learning_rate": 3.13e-07,
"inf_nan_count": 0
},
{
"step": 19950,
"loss": 6.4095,
"learning_rate": 2.08e-07,
"inf_nan_count": 0
},
{
"step": 19975,
"loss": 6.4814,
"learning_rate": 1.04e-07,
"inf_nan_count": 0
}
],
"evaluation_results": [
{
"step": 1000,
"paloma": 7.125172406420199e+27
},
{
"step": 1500,
"paloma": 6.5469212698356e+18
},
{
"step": 2000,
"paloma": 5.118641309912889e+18
},
{
"step": 2500,
"paloma": 3.37924315167126e+18
},
{
"step": 3000,
"paloma": 6.892747900243237e+18
},
{
"step": 3500,
"paloma": 2.0436832271954907e+19
},
{
"step": 4000,
"paloma": 4.1410268232311005e+19
},
{
"step": 4500,
"paloma": 3.4524340411684053e+19
},
{
"step": 5000,
"paloma": 2.320698426399461e+19
},
{
"step": 5500,
"paloma": 3.1834097890526753e+19
},
{
"step": 6000,
"paloma": 4.457139025979801e+19
},
{
"step": 6500,
"paloma": 7.3062353841856406e+19
},
{
"step": 7000,
"paloma": 1.2357969480287024e+20
},
{
"step": 7500,
"paloma": 2.7199371732053928e+20
},
{
"step": 8000,
"paloma": 7.181862506006892e+20
},
{
"step": 8500,
"paloma": 1.5123285241831744e+21
},
{
"step": 9000,
"paloma": 3.573074534351724e+21
},
{
"step": 9500,
"paloma": 7.403721262078652e+21
},
{
"step": 10000,
"paloma": 1.0650515380055143e+22
},
{
"step": 10500,
"paloma": 2.1077589258137904e+22
},
{
"step": 11000,
"paloma": 2.712416409262884e+22
},
{
"step": 11500,
"paloma": 4.877238989481918e+22
},
{
"step": 12000,
"paloma": 7.219509956260661e+22
},
{
"step": 12500,
"paloma": 1.1729325953411656e+23
},
{
"step": 13000,
"paloma": 1.729306754923583e+23
},
{
"step": 13500,
"paloma": 2.4018454768029128e+23
},
{
"step": 14000,
"paloma": 3.247328955167052e+23
},
{
"step": 14500,
"paloma": 4.43239578722337e+23
},
{
"step": 15000,
"paloma": 5.215164570276226e+23
},
{
"step": 15500,
"paloma": 6.102665947946271e+23
},
{
"step": 16000,
"paloma": 8.874629945146669e+23
},
{
"step": 16500,
"paloma": 9.981607121011733e+23
},
{
"step": 17000,
"paloma": 1.1075349421086151e+24
},
{
"step": 17500,
"paloma": 1.1064948792133394e+24
},
{
"step": 18000,
"paloma": 1.340918782615931e+24
},
{
"step": 18500,
"paloma": 1.4325241176004668e+24
},
{
"step": 19000,
"paloma": 1.5360601246943468e+24
},
{
"step": 19500,
"paloma": 1.6346615942991742e+24
},
{
"step": 20000,
"paloma": 1.645368302099182e+24
}
],
"config": {
"d_model": 96,
"n_layers": 12,
"max_seq_len": 2048,
"vocab_size": 50304,
"lr": 5e-05,
"max_steps": 20000,
"batch_size": 1
}
},
{
"run_name": "pico-decoder-tiny-dolma29k-v1",
"log_file": "log_20250828_225300.log",
"training_metrics": [
{
"step": 1000,
"loss": 7.7657,
"learning_rate": 0.00012,
"inf_nan_count": 0
},
{
"step": 1100,
"loss": 7.6733,
"learning_rate": 0.000132,
"inf_nan_count": 0
},
{
"step": 1200,
"loss": 7.5969,
"learning_rate": 0.000144,
"inf_nan_count": 0
},
{
"step": 1300,
"loss": 7.4765,
"learning_rate": 0.000156,
"inf_nan_count": 0
},
{
"step": 1400,
"loss": 7.3686,
"learning_rate": 0.000168,
"inf_nan_count": 0
},
{
"step": 1500,
"loss": 7.3251,
"learning_rate": 0.00018,
"inf_nan_count": 0
},
{
"step": 1600,
"loss": 7.184,
"learning_rate": 0.000192,
"inf_nan_count": 0
},
{
"step": 1700,
"loss": 7.1116,
"learning_rate": 0.000204,
"inf_nan_count": 0
},
{
"step": 1800,
"loss": 7.0565,
"learning_rate": 0.000216,
"inf_nan_count": 0
},
{
"step": 1900,
"loss": 6.9964,
"learning_rate": 0.000228,
"inf_nan_count": 0
},
{
"step": 2000,
"loss": 6.969,
"learning_rate": 0.00024,
"inf_nan_count": 0
},
{
"step": 2100,
"loss": 6.884,
"learning_rate": 0.000252,
"inf_nan_count": 0
},
{
"step": 2200,
"loss": 6.8334,
"learning_rate": 0.000264,
"inf_nan_count": 0
},
{
"step": 2300,
"loss": 6.815,
"learning_rate": 0.000276,
"inf_nan_count": 0
},
{
"step": 2400,
"loss": 6.7519,
"learning_rate": 0.000288,
"inf_nan_count": 0
},
{
"step": 2500,
"loss": 6.6908,
"learning_rate": 0.0003,
"inf_nan_count": 0
},
{
"step": 2600,
"loss": 6.6351,
"learning_rate": 0.0003,
"inf_nan_count": 0
},
{
"step": 2700,
"loss": 6.5568,
"learning_rate": 0.0003,
"inf_nan_count": 0
},
{
"step": 2800,
"loss": 6.5799,
"learning_rate": 0.0003,
"inf_nan_count": 0
},
{
"step": 2900,
"loss": 6.5467,
"learning_rate": 0.000299,
"inf_nan_count": 0
},
{
"step": 3000,
"loss": 6.4865,
"learning_rate": 0.000299,
"inf_nan_count": 0
},
{
"step": 3100,
"loss": 6.4604,
"learning_rate": 0.000299,
"inf_nan_count": 0
},
{
"step": 3200,
"loss": 6.4205,
"learning_rate": 0.000299,
"inf_nan_count": 0
},
{
"step": 3300,
"loss": 6.4127,
"learning_rate": 0.000299,
"inf_nan_count": 0
},
{
"step": 3400,
"loss": 6.3692,
"learning_rate": 0.000299,
"inf_nan_count": 0
},
{
"step": 3500,
"loss": 6.3761,
"learning_rate": 0.000298,
"inf_nan_count": 0
},
{
"step": 3600,
"loss": 6.2796,
"learning_rate": 0.000298,
"inf_nan_count": 0
},
{
"step": 3700,
"loss": 6.2988,
"learning_rate": 0.000298,
"inf_nan_count": 0
},
{
"step": 3800,
"loss": 6.2673,
"learning_rate": 0.000298,
"inf_nan_count": 0
},
{
"step": 3900,
"loss": 6.2715,
"learning_rate": 0.000298,
"inf_nan_count": 0
},
{
"step": 4000,
"loss": 6.189,
"learning_rate": 0.000298,
"inf_nan_count": 0
},
{
"step": 4100,
"loss": 6.1832,
"learning_rate": 0.000298,
"inf_nan_count": 0
},
{
"step": 4200,
"loss": 6.1553,
"learning_rate": 0.000297,
"inf_nan_count": 0
},
{
"step": 4300,
"loss": 6.1629,
"learning_rate": 0.000297,
"inf_nan_count": 0
},
{
"step": 4400,
"loss": 6.1061,
"learning_rate": 0.000297,
"inf_nan_count": 0
},
{
"step": 4500,
"loss": 6.1601,
"learning_rate": 0.000297,
"inf_nan_count": 0
},
{
"step": 4600,
"loss": 6.0963,
"learning_rate": 0.000297,
"inf_nan_count": 0
},
{
"step": 4700,
"loss": 6.078,
"learning_rate": 0.000297,
"inf_nan_count": 0
},
{
"step": 4800,
"loss": 6.0835,
"learning_rate": 0.000297,
"inf_nan_count": 0
},
{
"step": 4900,
"loss": 6.0519,
"learning_rate": 0.000296,
"inf_nan_count": 0
},
{
"step": 5000,
"loss": 6.0661,
"learning_rate": 0.000296,
"inf_nan_count": 0
},
{
"step": 5100,
"loss": 6.0121,
"learning_rate": 0.000296,
"inf_nan_count": 0
},
{
"step": 5200,
"loss": 6.0544,
"learning_rate": 0.000296,
"inf_nan_count": 0
},
{
"step": 5300,
"loss": 6.0224,
"learning_rate": 0.000296,
"inf_nan_count": 0
},
{
"step": 5400,
"loss": 5.9831,
"learning_rate": 0.000296,
"inf_nan_count": 0
},
{
"step": 5500,
"loss": 5.9553,
"learning_rate": 0.000295,
"inf_nan_count": 0
},
{
"step": 5600,
"loss": 5.9493,
"learning_rate": 0.000295,
"inf_nan_count": 0
},
{
"step": 5700,
"loss": 5.9943,
"learning_rate": 0.000295,
"inf_nan_count": 0
},
{
"step": 5800,
"loss": 5.963,
"learning_rate": 0.000295,
"inf_nan_count": 0
},
{
"step": 5900,
"loss": 5.9349,
"learning_rate": 0.000295,
"inf_nan_count": 0
},
{
"step": 6000,
"loss": 5.9087,
"learning_rate": 0.000295,
"inf_nan_count": 0
},
{
"step": 6100,
"loss": 5.8818,
"learning_rate": 0.000295,
"inf_nan_count": 0
},
{
"step": 6200,
"loss": 5.8535,
"learning_rate": 0.000294,
"inf_nan_count": 0
},
{
"step": 6300,
"loss": 5.8896,
"learning_rate": 0.000294,
"inf_nan_count": 0
},
{
"step": 6400,
"loss": 5.9007,
"learning_rate": 0.000294,
"inf_nan_count": 0
},
{
"step": 6500,
"loss": 5.8617,
"learning_rate": 0.000294,
"inf_nan_count": 0
},
{
"step": 6600,
"loss": 5.8201,
"learning_rate": 0.000294,
"inf_nan_count": 0
},
{
"step": 6700,
"loss": 5.8544,
"learning_rate": 0.000294,
"inf_nan_count": 0
},
{
"step": 6800,
"loss": 5.8532,
"learning_rate": 0.000293,
"inf_nan_count": 0
},
{
"step": 6900,
"loss": 5.795,
"learning_rate": 0.000293,
"inf_nan_count": 0
},
{
"step": 7000,
"loss": 5.8146,
"learning_rate": 0.000293,
"inf_nan_count": 0
},
{
"step": 7100,
"loss": 5.793,
"learning_rate": 0.000293,
"inf_nan_count": 0
},
{
"step": 7200,
"loss": 5.7827,
"learning_rate": 0.000293,
"inf_nan_count": 0
},
{
"step": 7300,
"loss": 5.7816,
"learning_rate": 0.000293,
"inf_nan_count": 0
},
{
"step": 7400,
"loss": 5.73,
"learning_rate": 0.000293,
"inf_nan_count": 0
},
{
"step": 7500,
"loss": 5.767,
"learning_rate": 0.000292,
"inf_nan_count": 0
},
{
"step": 7600,
"loss": 5.745,
"learning_rate": 0.000292,
"inf_nan_count": 0
},
{
"step": 7700,
"loss": 5.7499,
"learning_rate": 0.000292,
"inf_nan_count": 0
},
{
"step": 7800,
"loss": 5.7233,
"learning_rate": 0.000292,
"inf_nan_count": 0
},
{
"step": 7900,
"loss": 5.7219,
"learning_rate": 0.000292,
"inf_nan_count": 0
},
{
"step": 8000,
"loss": 5.7523,
"learning_rate": 0.000292,
"inf_nan_count": 0
},
{
"step": 8100,
"loss": 5.7145,
"learning_rate": 0.000291,
"inf_nan_count": 0
},
{
"step": 8200,
"loss": 5.7469,
"learning_rate": 0.000291,
"inf_nan_count": 0
},
{
"step": 8300,
"loss": 5.7363,
"learning_rate": 0.000291,
"inf_nan_count": 0
},
{
"step": 8400,
"loss": 5.6938,
"learning_rate": 0.000291,
"inf_nan_count": 0
},
{
"step": 8500,
"loss": 5.6994,
"learning_rate": 0.000291,
"inf_nan_count": 0
},
{
"step": 8600,
"loss": 5.6583,
"learning_rate": 0.000291,
"inf_nan_count": 0
},
{
"step": 8700,
"loss": 5.6885,
"learning_rate": 0.000291,
"inf_nan_count": 0
},
{
"step": 8800,
"loss": 5.6313,
"learning_rate": 0.00029,
"inf_nan_count": 0
},
{
"step": 8900,
"loss": 5.6314,
"learning_rate": 0.00029,
"inf_nan_count": 0
},
{
"step": 9000,
"loss": 5.6501,
"learning_rate": 0.00029,
"inf_nan_count": 0
},
{
"step": 9100,
"loss": 5.6357,
"learning_rate": 0.00029,
"inf_nan_count": 0
},
{
"step": 9200,
"loss": 5.6045,
"learning_rate": 0.00029,
"inf_nan_count": 0
},
{
"step": 9300,
"loss": 5.6405,
"learning_rate": 0.00029,
"inf_nan_count": 0
},
{
"step": 9400,
"loss": 5.6241,
"learning_rate": 0.00029,
"inf_nan_count": 0
},
{
"step": 9500,
"loss": 5.6247,
"learning_rate": 0.000289,
"inf_nan_count": 0
},
{
"step": 9600,
"loss": 5.5983,
"learning_rate": 0.000289,
"inf_nan_count": 0
},
{
"step": 9700,
"loss": 5.5978,
"learning_rate": 0.000289,
"inf_nan_count": 0
},
{
"step": 9800,
"loss": 5.5746,
"learning_rate": 0.000289,
"inf_nan_count": 0
}
],
"evaluation_results": [
{
"step": 1000,
"paloma": 2.5468931158531133e+19
},
{
"step": 2000,
"paloma": 3.627192449295412e+21
},
{
"step": 3000,
"paloma": 9.90975658825673e+22
},
{
"step": 4000,
"paloma": 2.6252526658823776e+24
},
{
"step": 5000,
"paloma": 7.294956881845611e+25
},
{
"step": 6000,
"paloma": 1.6856570425562805e+27
},
{
"step": 7000,
"paloma": 9.22180682233585e+28
},
{
"step": 8000,
"paloma": 3.1300823362207656e+29
},
{
"step": 9000,
"paloma": 4.983924509492406e+30
}
],
"config": {
"d_model": 96,
"n_layers": 12,
"max_seq_len": 2048,
"vocab_size": 50304,
"lr": 0.0003,
"max_steps": 200000,
"batch_size": 1
}
},
{
"run_name": "pico-decoder-tiny-dolma-teensy-v0",
"log_file": "log_20250828_210922.log",
"training_metrics": [
{
"step": 0,
"loss": 10.9914,
"learning_rate": 0.0,
"inf_nan_count": 0
}
],
"evaluation_results": [
{
"step": 0,
"paloma": 59434.76600609756
},
{
"step": 27,
"paloma": 59120.39268292683
}
],
"config": {
"d_model": 96,
"n_layers": 12,
"max_seq_len": 2048,
"vocab_size": 50304,
"lr": 0.0003,
"max_steps": 200000,
"batch_size": 8
}
},
{
"run_name": "pico-decoder-tiny-dolma-teensy-v1",
"log_file": "log_20250828_220514.log",
"training_metrics": [
{
"step": 0,
"loss": 10.9886,
"learning_rate": 0.0,
"inf_nan_count": 0
},
{
"step": 100,
"loss": 10.9373,
"learning_rate": 1.2e-05,
"inf_nan_count": 0
},
{
"step": 200,
"loss": 10.5423,
"learning_rate": 2.4e-05,
"inf_nan_count": 0
},
{
"step": 300,
"loss": 9.9452,
"learning_rate": 3.6e-05,
"inf_nan_count": 0
},
{
"step": 400,
"loss": 9.449,
"learning_rate": 4.8e-05,
"inf_nan_count": 0
},
{
"step": 500,
"loss": 8.8455,
"learning_rate": 6e-05,
"inf_nan_count": 0
},
{
"step": 600,
"loss": 8.1482,
"learning_rate": 7.2e-05,
"inf_nan_count": 0
},
{
"step": 700,
"loss": 7.4303,
"learning_rate": 8.4e-05,
"inf_nan_count": 0
},
{
"step": 800,
"loss": 7.0363,
"learning_rate": 9.6e-05,
"inf_nan_count": 0
},
{
"step": 900,
"loss": 6.9702,
"learning_rate": 0.000108,
"inf_nan_count": 0
},
{
"step": 1000,
"loss": 6.8975,
"learning_rate": 0.00012,
"inf_nan_count": 0
},
{
"step": 1100,
"loss": 6.892,
"learning_rate": 0.000132,
"inf_nan_count": 0
},
{
"step": 1200,
"loss": 6.6684,
"learning_rate": 0.000144,
"inf_nan_count": 0
},
{
"step": 1300,
"loss": 6.4754,
"learning_rate": 0.000156,
"inf_nan_count": 0
},
{
"step": 1400,
"loss": 6.3649,
"learning_rate": 0.000168,
"inf_nan_count": 0
},
{
"step": 1500,
"loss": 6.2981,
"learning_rate": 0.00018,
"inf_nan_count": 0
},
{
"step": 1600,
"loss": 6.1551,
"learning_rate": 0.000192,
"inf_nan_count": 0
},
{
"step": 1700,
"loss": 5.9163,
"learning_rate": 0.000204,
"inf_nan_count": 0
}
],
"evaluation_results": [
{
"step": 1000,
"paloma": 9.54583880403771e+19
},
{
"step": 1755,
"paloma": 2.945795672816324e+21
}
],
"config": {
"d_model": 96,
"n_layers": 12,
"max_seq_len": 2048,
"vocab_size": 50304,
"lr": 0.0003,
"max_steps": 200000,
"batch_size": 4
}
},
{
"run_name": "pico-decoder-tiny-dolma5M-v1",
"log_file": "log_20250830_014108.log",
"training_metrics": [
{
"step": 32000,
"loss": 6.3376,
"learning_rate": 7.32e-06,
"inf_nan_count": 0
},
{
"step": 32025,
"loss": 6.1999,
"learning_rate": 7.28e-06,
"inf_nan_count": 0
},
{
"step": 32050,
"loss": 6.1488,
"learning_rate": 7.24e-06,
"inf_nan_count": 0
},
{
"step": 32075,
"loss": 6.046,
"learning_rate": 7.19e-06,
"inf_nan_count": 0
}
],
"evaluation_results": [
{
"step": 32000,
"paloma": 2.977755235898109e+26
}
],
"config": {
"d_model": 96,
"n_layers": 12,
"max_seq_len": 2048,
"vocab_size": 50304,
"lr": 5e-05,
"max_steps": 20000,
"batch_size": 1
}
}
],
"summary": {
"total_runs": 6,
"run_names": [
"pico-decoder-tiny-dolma29k-v2",
"pico-decoder-tiny-dolma29k-v3",
"pico-decoder-tiny-dolma29k-v1",
"pico-decoder-tiny-dolma-teensy-v0",
"pico-decoder-tiny-dolma-teensy-v1",
"pico-decoder-tiny-dolma5M-v1"
]
}
}